From 0344d58e1408864f0970bfa3111d4bf3b2bad8f5 Mon Sep 17 00:00:00 2001 From: noemotiovon <757486878@qq.com> Date: Tue, 19 Aug 2025 09:31:18 +0000 Subject: [PATCH 1/4] [CANN] Optimize RMS_NORM using cache Signed-off-by: noemotiovon <757486878@qq.com> --- ggml/src/ggml-cann/aclnn_ops.cpp | 138 ++++++++++++++++++++++++------- ggml/src/ggml-cann/common.h | 3 + 2 files changed, 111 insertions(+), 30 deletions(-) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 2a5cb8abfa137..507aa2912da06 100755 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -867,6 +867,98 @@ static aclTensor* aclnn_values(ggml_backend_cann_context& ctx, void* buffer, return acl_tensor; } +/** + * @brief Fills a tensor with a scalar value. + * + * This function fills the destination tensor `acl_dst` with the scalar value + * `scalar`. + * + * @param ctx The context for the CANN backend operations. + * @param scalar The scalar value used to fill the tensor. + * @param acl_dst The destination tensor to be filled with the scalar value. + */ +static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar, + aclTensor* acl_dst) { + auto acl_scalar = aclCreateScalar(&scalar, aclDataType::ACL_FLOAT); + GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_dst, acl_scalar); + ggml_cann_release_resources(ctx, acl_scalar); +} + +/** + * @brief Get or expand cached float32 tensors filled with scalar values. + * + * This function manages a cache of float32 tensors (zero-filled and one-filled). + * If the cache does not exist, it will initialize the cache with a zero tensor + * and a one tensor. If the requested tensor size exceeds the current cache + * capacity, the cache will be expanded accordingly. The function then returns + * an aclTensor created from the cached memory (either zero-filled or one-filled), + * depending on the input `value`. + * + * @param ctx The CANN backend context that manages cache memory. + * @param ne The tensor shape array (number of elements in each dimension). + * @param nb The stride size for each dimension. + * @param dims The number of tensor dimensions. + * @param value The scalar value (only supports 0 or 1) used to determine whether + * to return the zero-cache tensor or the one-cache tensor. + * @return An aclTensor pointer corresponding to the cached tensor. 
+ */ +static aclTensor* get_f32_cache_acl_tensor(ggml_backend_cann_context& ctx, + int64_t* ne, size_t* nb, + int64_t dims, int64_t value) { + // init cache + if(ctx.f32_zero_cache == nullptr) { + // zero-cache pool init + size_t size = ctx.f32_cache_element * sizeof(float); + ACL_CHECK(aclrtMalloc(&ctx.f32_zero_cache, size, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMemsetAsync(ctx.f32_zero_cache, size, 0, size, ctx.stream())); + + // one-cache pool init + int64_t pool_ne[1] = { ctx.f32_cache_element }; + size_t pool_nb[1] = { sizeof(float) }; + ACL_CHECK(aclrtMalloc(&ctx.f32_one_cache, size, ACL_MEM_MALLOC_HUGE_FIRST)); + aclTensor* acl_one = ggml_cann_create_tensor( + ctx.f32_one_cache, ACL_FLOAT, sizeof(float), pool_ne, pool_nb, + 1); + aclnn_fill_scalar(ctx, 1, acl_one); + ggml_cann_release_resources(ctx, acl_one); + } + + // Cache expansion + int64_t n_element = 1; + for(int i = 0; i < dims; i++) { + n_element = n_element * ne[i]; + } + if (ctx.f32_cache_element < n_element) { + // free old mem + aclrtFree(ctx.f32_zero_cache); + aclrtFree(ctx.f32_one_cache); + // init zero cache + ctx.f32_cache_element = n_element; + size_t size = n_element * sizeof(float); + ACL_CHECK(aclrtMalloc(&ctx.f32_zero_cache, size, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMemsetAsync(ctx.f32_zero_cache, size, 0, size, ctx.stream())); + + // one-cache pool init + int64_t pool_ne[1] = { n_element }; + size_t pool_nb[1] = { sizeof(float) }; + ACL_CHECK(aclrtMalloc(&ctx.f32_one_cache, size, ACL_MEM_MALLOC_HUGE_FIRST)); + aclTensor* acl_one = ggml_cann_create_tensor( + ctx.f32_one_cache, ACL_FLOAT, sizeof(float), pool_ne, pool_nb, + 1); + aclnn_fill_scalar(ctx, 1, acl_one); + ggml_cann_release_resources(ctx, acl_one); + } + + void* cache; + if (value == 0) { + cache = ctx.f32_zero_cache; + } else { + cache = ctx.f32_one_cache; + } + + return ggml_cann_create_tensor(cache, ACL_FLOAT, sizeof(float), ne, nb, dims); +} + void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_tensor* src = dst->src[0]; @@ -875,20 +967,23 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { float eps; memcpy(&eps, dst->op_params, sizeof(float)); - size_t one_tensor_n_bytes = src->ne[0] * ggml_element_size(src); - ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes); - aclTensor* acl_gamma = aclnn_values( - ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, 1, - ggml_cann_type_mapping(src->type), ggml_element_size(src)); - - size_t zero_tensor_n_bytes = - src->ne[1] * src->ne[2] * src->ne[3] * ggml_element_size(src); - ggml_cann_pool_alloc zero_tensor_allocator(ctx.pool(), zero_tensor_n_bytes); - aclTensor* acl_rstd = - aclnn_zero(ctx, zero_tensor_allocator.get(), zero_tensor_n_bytes, - src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type), - ggml_element_size(src)); + // build gamma, one... + size_t acl_gamma_nb[GGML_MAX_DIMS]; + acl_gamma_nb[0] = sizeof(float); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + acl_gamma_nb[i] = acl_gamma_nb[i - 1] * src->ne[i - 1]; + } + aclTensor* acl_gamma = get_f32_cache_acl_tensor(ctx, src->ne, acl_gamma_nb, 1, 1); + + // build rstd, zero... 
+ size_t acl_rstd_nb[GGML_MAX_DIMS]; + acl_rstd_nb[0] = sizeof(float); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + acl_rstd_nb[i] = acl_rstd_nb[i - 1] * src->ne[i - 1]; + } + aclTensor* acl_rstd = get_f32_cache_acl_tensor(ctx, src->ne, acl_rstd_nb, GGML_MAX_DIMS, 0); + GGML_CANN_CALL_ACLNN_OP(ctx, RmsNorm, acl_src, acl_gamma, eps, acl_dst, acl_rstd); ggml_cann_release_resources(ctx, acl_src, acl_dst, acl_gamma, acl_rstd); } @@ -1277,23 +1372,6 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, tmp_permute_tensor, tmp_mul_tensor, acl_dst); } -/** - * @brief Fills a tensor with a scalar value. - * - * This function fills the destination tensor `acl_dst` with the scalar value - * `scalar`. - * - * @param ctx The context for the CANN backend operations. - * @param scalar The scalar value used to fill the tensor. - * @param acl_dst The destination tensor to be filled with the scalar value. - */ -static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar, - aclTensor* acl_dst) { - auto acl_scalar = aclCreateScalar(&scalar, aclDataType::ACL_FLOAT); - GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_dst, acl_scalar); - ggml_cann_release_resources(ctx, acl_scalar); -} - /** * @brief Raises each element of a tensor to the power of the corresponding * element in another tensor. diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h index 2c2033bfba857..f61ea9e5db20e 100755 --- a/ggml/src/ggml-cann/common.h +++ b/ggml/src/ggml-cann/common.h @@ -379,6 +379,9 @@ struct ggml_backend_cann_context { cann_task_queue task_queue; bool async_mode; bool support_set_rows; + void* f32_zero_cache = nullptr; + void* f32_one_cache = nullptr; + int64_t f32_cache_element = 1024 * 1024; aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */ From 9b0ec0ead7e966a8f9b3ab315c2e7c505eedc48b Mon Sep 17 00:00:00 2001 From: noemotiovon <757486878@qq.com> Date: Tue, 19 Aug 2025 09:42:02 +0000 Subject: [PATCH 2/4] fix typo Signed-off-by: noemotiovon <757486878@qq.com> --- ggml/src/ggml-cann/aclnn_ops.cpp | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 507aa2912da06..80e9950c6611c 100755 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -886,12 +886,12 @@ static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar, /** * @brief Get or expand cached float32 tensors filled with scalar values. - * - * This function manages a cache of float32 tensors (zero-filled and one-filled). - * If the cache does not exist, it will initialize the cache with a zero tensor - * and a one tensor. If the requested tensor size exceeds the current cache - * capacity, the cache will be expanded accordingly. The function then returns - * an aclTensor created from the cached memory (either zero-filled or one-filled), + * + * This function manages a cache of float32 tensors (zero-filled and one-filled). + * If the cache does not exist, it will initialize the cache with a zero tensor + * and a one tensor. If the requested tensor size exceeds the current cache + * capacity, the cache will be expanded accordingly. The function then returns + * an aclTensor created from the cached memory (either zero-filled or one-filled), * depending on the input `value`. * * @param ctx The CANN backend context that manages cache memory. 
@@ -902,7 +902,7 @@ static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar, * to return the zero-cache tensor or the one-cache tensor. * @return An aclTensor pointer corresponding to the cached tensor. */ -static aclTensor* get_f32_cache_acl_tensor(ggml_backend_cann_context& ctx, +static aclTensor* get_f32_cache_acl_tensor(ggml_backend_cann_context& ctx, int64_t* ne, size_t* nb, int64_t dims, int64_t value) { // init cache @@ -911,10 +911,10 @@ static aclTensor* get_f32_cache_acl_tensor(ggml_backend_cann_context& ctx, size_t size = ctx.f32_cache_element * sizeof(float); ACL_CHECK(aclrtMalloc(&ctx.f32_zero_cache, size, ACL_MEM_MALLOC_HUGE_FIRST)); ACL_CHECK(aclrtMemsetAsync(ctx.f32_zero_cache, size, 0, size, ctx.stream())); - + // one-cache pool init - int64_t pool_ne[1] = { ctx.f32_cache_element }; - size_t pool_nb[1] = { sizeof(float) }; + int64_t pool_ne[1] = { ctx.f32_cache_element }; + size_t pool_nb[1] = { sizeof(float) }; ACL_CHECK(aclrtMalloc(&ctx.f32_one_cache, size, ACL_MEM_MALLOC_HUGE_FIRST)); aclTensor* acl_one = ggml_cann_create_tensor( ctx.f32_one_cache, ACL_FLOAT, sizeof(float), pool_ne, pool_nb, @@ -937,10 +937,10 @@ static aclTensor* get_f32_cache_acl_tensor(ggml_backend_cann_context& ctx, size_t size = n_element * sizeof(float); ACL_CHECK(aclrtMalloc(&ctx.f32_zero_cache, size, ACL_MEM_MALLOC_HUGE_FIRST)); ACL_CHECK(aclrtMemsetAsync(ctx.f32_zero_cache, size, 0, size, ctx.stream())); - + // one-cache pool init - int64_t pool_ne[1] = { n_element }; - size_t pool_nb[1] = { sizeof(float) }; + int64_t pool_ne[1] = { n_element }; + size_t pool_nb[1] = { sizeof(float) }; ACL_CHECK(aclrtMalloc(&ctx.f32_one_cache, size, ACL_MEM_MALLOC_HUGE_FIRST)); aclTensor* acl_one = ggml_cann_create_tensor( ctx.f32_one_cache, ACL_FLOAT, sizeof(float), pool_ne, pool_nb, @@ -948,14 +948,14 @@ static aclTensor* get_f32_cache_acl_tensor(ggml_backend_cann_context& ctx, aclnn_fill_scalar(ctx, 1, acl_one); ggml_cann_release_resources(ctx, acl_one); } - + void* cache; if (value == 0) { cache = ctx.f32_zero_cache; } else { cache = ctx.f32_one_cache; } - + return ggml_cann_create_tensor(cache, ACL_FLOAT, sizeof(float), ne, nb, dims); } @@ -983,7 +983,7 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { acl_rstd_nb[i] = acl_rstd_nb[i - 1] * src->ne[i - 1]; } aclTensor* acl_rstd = get_f32_cache_acl_tensor(ctx, src->ne, acl_rstd_nb, GGML_MAX_DIMS, 0); - + GGML_CANN_CALL_ACLNN_OP(ctx, RmsNorm, acl_src, acl_gamma, eps, acl_dst, acl_rstd); ggml_cann_release_resources(ctx, acl_src, acl_dst, acl_gamma, acl_rstd); } From c24b995a1ba1bed308cb5df761ca12036a4a3c33 Mon Sep 17 00:00:00 2001 From: noemotiovon <757486878@qq.com> Date: Wed, 20 Aug 2025 03:07:35 +0000 Subject: [PATCH 3/4] fix review comment Signed-off-by: noemotiovon <757486878@qq.com> --- ggml/src/ggml-cann/aclnn_ops.cpp | 90 +++++++++++++++----------------- ggml/src/ggml-cann/common.h | 3 +- 2 files changed, 43 insertions(+), 50 deletions(-) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 80e9950c6611c..ef80236f64818 100755 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -902,57 +902,50 @@ static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar, * to return the zero-cache tensor or the one-cache tensor. * @return An aclTensor pointer corresponding to the cached tensor. 
*/ -static aclTensor* get_f32_cache_acl_tensor(ggml_backend_cann_context& ctx, +static aclTensor* get_f32_zero_or_one_cache_acl_tensor(ggml_backend_cann_context& ctx, int64_t* ne, size_t* nb, int64_t dims, int64_t value) { - // init cache - if(ctx.f32_zero_cache == nullptr) { - // zero-cache pool init - size_t size = ctx.f32_cache_element * sizeof(float); - ACL_CHECK(aclrtMalloc(&ctx.f32_zero_cache, size, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMemsetAsync(ctx.f32_zero_cache, size, 0, size, ctx.stream())); - - // one-cache pool init - int64_t pool_ne[1] = { ctx.f32_cache_element }; - size_t pool_nb[1] = { sizeof(float) }; - ACL_CHECK(aclrtMalloc(&ctx.f32_one_cache, size, ACL_MEM_MALLOC_HUGE_FIRST)); - aclTensor* acl_one = ggml_cann_create_tensor( - ctx.f32_one_cache, ACL_FLOAT, sizeof(float), pool_ne, pool_nb, - 1); - aclnn_fill_scalar(ctx, 1, acl_one); - ggml_cann_release_resources(ctx, acl_one); - } + // just support one and zero cache + GGML_ASSERT(value == 1 || value == 0); - // Cache expansion + // Cache init or expansion int64_t n_element = 1; for(int i = 0; i < dims; i++) { n_element = n_element * ne[i]; } - if (ctx.f32_cache_element < n_element) { - // free old mem - aclrtFree(ctx.f32_zero_cache); - aclrtFree(ctx.f32_one_cache); - // init zero cache - ctx.f32_cache_element = n_element; - size_t size = n_element * sizeof(float); - ACL_CHECK(aclrtMalloc(&ctx.f32_zero_cache, size, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMemsetAsync(ctx.f32_zero_cache, size, 0, size, ctx.stream())); - - // one-cache pool init - int64_t pool_ne[1] = { n_element }; - size_t pool_nb[1] = { sizeof(float) }; - ACL_CHECK(aclrtMalloc(&ctx.f32_one_cache, size, ACL_MEM_MALLOC_HUGE_FIRST)); - aclTensor* acl_one = ggml_cann_create_tensor( - ctx.f32_one_cache, ACL_FLOAT, sizeof(float), pool_ne, pool_nb, - 1); - aclnn_fill_scalar(ctx, 1, acl_one); - ggml_cann_release_resources(ctx, acl_one); - } - + size_t size = n_element * sizeof(float); void* cache; if (value == 0) { + if(ctx.f32_zero_cache_element < n_element){ + //free old mem + if(ctx.f32_zero_cache != nullptr){ + aclrtFree(ctx.f32_zero_cache); + } + + //init zero cache + ctx.f32_zero_cache_element = n_element; + ACL_CHECK(aclrtMalloc(&ctx.f32_zero_cache, size, ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMemsetAsync(ctx.f32_zero_cache, size, 0, size, ctx.stream())); + } cache = ctx.f32_zero_cache; } else { + if(ctx.f32_one_cache_element < n_element){ + //free old mem + if(ctx.f32_one_cache != nullptr){ + aclrtFree(ctx.f32_one_cache); + } + + //init one cache + ctx.f32_one_cache_element = n_element; + int64_t pool_ne[1] = { n_element }; + size_t pool_nb[1] = { sizeof(float) }; + ACL_CHECK(aclrtMalloc(&ctx.f32_one_cache, size, ACL_MEM_MALLOC_HUGE_FIRST)); + aclTensor* acl_one = ggml_cann_create_tensor( + ctx.f32_one_cache, ACL_FLOAT, sizeof(float), pool_ne, pool_nb, + 1); + aclnn_fill_scalar(ctx, 1, acl_one); + ggml_cann_release_resources(ctx, acl_one); + } cache = ctx.f32_one_cache; } @@ -974,7 +967,7 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { for (int i = 1; i < GGML_MAX_DIMS; i++) { acl_gamma_nb[i] = acl_gamma_nb[i - 1] * src->ne[i - 1]; } - aclTensor* acl_gamma = get_f32_cache_acl_tensor(ctx, src->ne, acl_gamma_nb, 1, 1); + aclTensor* acl_gamma = get_f32_zero_or_one_cache_acl_tensor(ctx, src->ne, acl_gamma_nb, 1, 1); // build rstd, zero... 
size_t acl_rstd_nb[GGML_MAX_DIMS]; @@ -982,7 +975,7 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { for (int i = 1; i < GGML_MAX_DIMS; i++) { acl_rstd_nb[i] = acl_rstd_nb[i - 1] * src->ne[i - 1]; } - aclTensor* acl_rstd = get_f32_cache_acl_tensor(ctx, src->ne, acl_rstd_nb, GGML_MAX_DIMS, 0); + aclTensor* acl_rstd = get_f32_zero_or_one_cache_acl_tensor(ctx, src->ne, acl_rstd_nb, GGML_MAX_DIMS, 0); GGML_CANN_CALL_ACLNN_OP(ctx, RmsNorm, acl_src, acl_gamma, eps, acl_dst, acl_rstd); ggml_cann_release_resources(ctx, acl_src, acl_dst, acl_gamma, acl_rstd); @@ -998,14 +991,13 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, const int n_past = ((int32_t*)dst->op_params)[0]; - size_t one_tensor_n_bytes = src->ne[0] * src->ne[1] * src->ne[2] * - src->ne[3] * ggml_element_size(src); - ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes); + ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), ggml_nbytes(src)); + void* buffer = one_tensor_allocator.get(); - aclTensor* mask_tensor = - aclnn_values(ctx, one_tensor_allocator.get(), one_tensor_n_bytes, - src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type), - ggml_element_size(src), value); + aclTensor* mask_tensor = ggml_cann_create_tensor(buffer, ggml_cann_type_mapping(src->type), + ggml_type_size(src->type), src->ne, src->nb, GGML_MAX_DIMS); + + aclnn_fill_scalar(ctx, value, mask_tensor); aclScalar* alpha = nullptr; float alphaValue = 1.0f; diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h index f61ea9e5db20e..5858bd3f6a197 100755 --- a/ggml/src/ggml-cann/common.h +++ b/ggml/src/ggml-cann/common.h @@ -381,7 +381,8 @@ struct ggml_backend_cann_context { bool support_set_rows; void* f32_zero_cache = nullptr; void* f32_one_cache = nullptr; - int64_t f32_cache_element = 1024 * 1024; + int64_t f32_zero_cache_element = 0; + int64_t f32_one_cache_element = 0; aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */ From 3c87db443ba6292af7c1f6145e2b9ab0219f56c2 Mon Sep 17 00:00:00 2001 From: noemotiovon <757486878@qq.com> Date: Wed, 20 Aug 2025 07:02:38 +0000 Subject: [PATCH 4/4] codestyle adjustment Signed-off-by: noemotiovon <757486878@qq.com> --- ggml/src/ggml-cann/aclnn_ops.cpp | 127 +++++++++++++++++-------------- 1 file changed, 69 insertions(+), 58 deletions(-) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index ef80236f64818..8f65904b8fe51 100755 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -885,71 +885,66 @@ static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar, } /** - * @brief Get or expand cached float32 tensors filled with scalar values. - * - * This function manages a cache of float32 tensors (zero-filled and one-filled). - * If the cache does not exist, it will initialize the cache with a zero tensor - * and a one tensor. If the requested tensor size exceeds the current cache - * capacity, the cache will be expanded accordingly. The function then returns - * an aclTensor created from the cached memory (either zero-filled or one-filled), - * depending on the input `value`. - * - * @param ctx The CANN backend context that manages cache memory. - * @param ne The tensor shape array (number of elements in each dimension). - * @param nb The stride size for each dimension. - * @param dims The number of tensor dimensions. 
- * @param value The scalar value (only supports 0 or 1) used to determine whether - * to return the zero-cache tensor or the one-cache tensor. - * @return An aclTensor pointer corresponding to the cached tensor. + * @brief Get or expand a cached float32 tensor filled with a scalar value. + * + * This function manages cached device memory for float32 tensors. If the current + * cache size is insufficient for the requested tensor shape, the old memory will + * be released and new memory will be allocated. The allocated buffer is then + * initialized either with zeros (when @p value == 0.0f) or with the given scalar + * value using CANN operations. Finally, an aclTensor object is created from the + * cached memory and returned. + * + * @param ctx The CANN backend context that manages device memory. + * @param buffer A pointer to the cached device buffer (will be allocated + * or reallocated if necessary). + * @param cache_element The current number of cached elements. This will be + * updated when the cache is expanded. + * @param ne The tensor shape array (number of elements in each dimension). + * @param nb The stride size for each dimension. + * @param dims The number of tensor dimensions. + * @param value The scalar value used to fill the tensor (supports zero + * initialization via memset or arbitrary values via fill_scalar). + * @return An aclTensor pointer created from the cached buffer. */ -static aclTensor* get_f32_zero_or_one_cache_acl_tensor(ggml_backend_cann_context& ctx, - int64_t* ne, size_t* nb, - int64_t dims, int64_t value) { - // just support one and zero cache - GGML_ASSERT(value == 1 || value == 0); - - // Cache init or expansion +static aclTensor* get_f32_cache_acl_tensor( + ggml_backend_cann_context& ctx, + void** buffer, + int64_t &cache_element, + int64_t* ne, + size_t* nb, + int64_t dims, + float value) { + // Calculate total number of elements int64_t n_element = 1; - for(int i = 0; i < dims; i++) { - n_element = n_element * ne[i]; + for (int i = 0; i < dims; i++) { + n_element *= ne[i]; } size_t size = n_element * sizeof(float); - void* cache; - if (value == 0) { - if(ctx.f32_zero_cache_element < n_element){ - //free old mem - if(ctx.f32_zero_cache != nullptr){ - aclrtFree(ctx.f32_zero_cache); - } - - //init zero cache - ctx.f32_zero_cache_element = n_element; - ACL_CHECK(aclrtMalloc(&ctx.f32_zero_cache, size, ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMemsetAsync(ctx.f32_zero_cache, size, 0, size, ctx.stream())); + + // Allocate or expand cache if needed + if (cache_element < n_element) { + if (*buffer != nullptr) { + aclrtFree(*buffer); + *buffer = nullptr; } - cache = ctx.f32_zero_cache; - } else { - if(ctx.f32_one_cache_element < n_element){ - //free old mem - if(ctx.f32_one_cache != nullptr){ - aclrtFree(ctx.f32_one_cache); - } - - //init one cache - ctx.f32_one_cache_element = n_element; + + ACL_CHECK(aclrtMalloc(buffer, size, ACL_MEM_MALLOC_HUGE_FIRST)); + cache_element = n_element; + + // Initialize cache + if (value == 0.0f) { + ACL_CHECK(aclrtMemsetAsync(*buffer, size, 0, size, ctx.stream())); + } else { int64_t pool_ne[1] = { n_element }; size_t pool_nb[1] = { sizeof(float) }; - ACL_CHECK(aclrtMalloc(&ctx.f32_one_cache, size, ACL_MEM_MALLOC_HUGE_FIRST)); - aclTensor* acl_one = ggml_cann_create_tensor( - ctx.f32_one_cache, ACL_FLOAT, sizeof(float), pool_ne, pool_nb, - 1); - aclnn_fill_scalar(ctx, 1, acl_one); - ggml_cann_release_resources(ctx, acl_one); + aclTensor* acl_value = ggml_cann_create_tensor( + *buffer, ACL_FLOAT, sizeof(float), pool_ne, 
pool_nb, 1); + aclnn_fill_scalar(ctx, 1, acl_value); + ggml_cann_release_resources(ctx, acl_value); } - cache = ctx.f32_one_cache; } - return ggml_cann_create_tensor(cache, ACL_FLOAT, sizeof(float), ne, nb, dims); + return ggml_cann_create_tensor(*buffer, ACL_FLOAT, sizeof(float), ne, nb, dims); } void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { @@ -967,7 +962,15 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { for (int i = 1; i < GGML_MAX_DIMS; i++) { acl_gamma_nb[i] = acl_gamma_nb[i - 1] * src->ne[i - 1]; } - aclTensor* acl_gamma = get_f32_zero_or_one_cache_acl_tensor(ctx, src->ne, acl_gamma_nb, 1, 1); + aclTensor* acl_gamma = get_f32_cache_acl_tensor( + ctx, + &ctx.f32_one_cache, + ctx.f32_one_cache_element, + src->ne, + acl_gamma_nb, + 1, // dims + 1.0f // value + ); // build rstd, zero... size_t acl_rstd_nb[GGML_MAX_DIMS]; @@ -975,7 +978,15 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { for (int i = 1; i < GGML_MAX_DIMS; i++) { acl_rstd_nb[i] = acl_rstd_nb[i - 1] * src->ne[i - 1]; } - aclTensor* acl_rstd = get_f32_zero_or_one_cache_acl_tensor(ctx, src->ne, acl_rstd_nb, GGML_MAX_DIMS, 0); + aclTensor* acl_rstd = get_f32_cache_acl_tensor( + ctx, + &ctx.f32_zero_cache, + ctx.f32_zero_cache_element, + src->ne, + acl_rstd_nb, + GGML_MAX_DIMS, + 0.0f // value + ); GGML_CANN_CALL_ACLNN_OP(ctx, RmsNorm, acl_src, acl_gamma, eps, acl_dst, acl_rstd); ggml_cann_release_resources(ctx, acl_src, acl_dst, acl_gamma, acl_rstd); @@ -996,7 +1007,7 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, aclTensor* mask_tensor = ggml_cann_create_tensor(buffer, ggml_cann_type_mapping(src->type), ggml_type_size(src->type), src->ne, src->nb, GGML_MAX_DIMS); - + aclnn_fill_scalar(ctx, value, mask_tensor); aclScalar* alpha = nullptr;
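
Note on the technique: the series replaces the per-call scratch allocations in ggml_cann_rms_norm with grow-only caches held on the backend context — f32_one_cache (used as gamma) and f32_zero_cache (used as rstd) — each tracked by an element count and reallocated only when a larger tensor is requested. The sketch below is a minimal, host-only illustration of that pattern under assumed names (f32_scalar_cache, get_f32_cache do not exist in ggml-cann); the real implementation allocates device memory with aclrtMalloc and initializes it with aclrtMemsetAsync (zero case) or aclnn_fill_scalar (non-zero case).

    // Illustrative sketch only: grow-only cache of float32 values equal to `value`.
    // Names and host-side allocation are assumptions, not the ggml-cann API.
    #include <cstdint>
    #include <cstdlib>

    struct f32_scalar_cache {
        float*  data     = nullptr;  // cached buffer (device memory in the real backend)
        int64_t n_cached = 0;        // number of elements currently filled
    };

    // Return a buffer holding at least n_element floats, all equal to `value`.
    static float* get_f32_cache(f32_scalar_cache & cache, int64_t n_element, float value) {
        if (cache.n_cached < n_element) {
            std::free(cache.data);  // drop the smaller buffer
            cache.data = static_cast<float*>(std::malloc(n_element * sizeof(float)));
            for (int64_t i = 0; i < n_element; i++) {
                cache.data[i] = value;  // fill once; later calls reuse the buffer
            }
            cache.n_cached = n_element;
        }
        // Callers needing fewer elements just view a prefix of the buffer; the CANN
        // version does the same by passing the caller's ne/nb to ggml_cann_create_tensor.
        return cache.data;
    }

Because the cache never shrinks, repeated RMS_NORM calls with the same or smaller shapes reuse the existing buffers and skip both the allocation and the fill that the old aclnn_values/aclnn_zero path performed on every invocation.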