diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 2a5cb8abfa137..8f65904b8fe51 100755 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -867,6 +867,86 @@ static aclTensor* aclnn_values(ggml_backend_cann_context& ctx, void* buffer, return acl_tensor; } +/** + * @brief Fills a tensor with a scalar value. + * + * This function fills the destination tensor `acl_dst` with the scalar value + * `scalar`. + * + * @param ctx The context for the CANN backend operations. + * @param scalar The scalar value used to fill the tensor. + * @param acl_dst The destination tensor to be filled with the scalar value. + */ +static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar, + aclTensor* acl_dst) { + auto acl_scalar = aclCreateScalar(&scalar, aclDataType::ACL_FLOAT); + GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_dst, acl_scalar); + ggml_cann_release_resources(ctx, acl_scalar); +} + +/** + * @brief Get or expand a cached float32 tensor filled with a scalar value. + * + * This function manages cached device memory for float32 tensors. If the current + * cache size is insufficient for the requested tensor shape, the old memory will + * be released and new memory will be allocated. The allocated buffer is then + * initialized either with zeros (when @p value == 0.0f) or with the given scalar + * value using CANN operations. Finally, an aclTensor object is created from the + * cached memory and returned. + * + * @param ctx The CANN backend context that manages device memory. + * @param buffer A pointer to the cached device buffer (will be allocated + * or reallocated if necessary). + * @param cache_element The current number of cached elements. This will be + * updated when the cache is expanded. + * @param ne The tensor shape array (number of elements in each dimension). + * @param nb The stride size for each dimension. + * @param dims The number of tensor dimensions. + * @param value The scalar value used to fill the tensor (supports zero + * initialization via memset or arbitrary values via fill_scalar). + * @return An aclTensor pointer created from the cached buffer. + */ +static aclTensor* get_f32_cache_acl_tensor( + ggml_backend_cann_context& ctx, + void** buffer, + int64_t &cache_element, + int64_t* ne, + size_t* nb, + int64_t dims, + float value) { + // Calculate total number of elements + int64_t n_element = 1; + for (int i = 0; i < dims; i++) { + n_element *= ne[i]; + } + size_t size = n_element * sizeof(float); + + // Allocate or expand cache if needed + if (cache_element < n_element) { + if (*buffer != nullptr) { + aclrtFree(*buffer); + *buffer = nullptr; + } + + ACL_CHECK(aclrtMalloc(buffer, size, ACL_MEM_MALLOC_HUGE_FIRST)); + cache_element = n_element; + + // Initialize cache + if (value == 0.0f) { + ACL_CHECK(aclrtMemsetAsync(*buffer, size, 0, size, ctx.stream())); + } else { + int64_t pool_ne[1] = { n_element }; + size_t pool_nb[1] = { sizeof(float) }; + aclTensor* acl_value = ggml_cann_create_tensor( + *buffer, ACL_FLOAT, sizeof(float), pool_ne, pool_nb, 1); + aclnn_fill_scalar(ctx, 1, acl_value); + ggml_cann_release_resources(ctx, acl_value); + } + } + + return ggml_cann_create_tensor(*buffer, ACL_FLOAT, sizeof(float), ne, nb, dims); +} + void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_tensor* src = dst->src[0]; @@ -875,20 +955,39 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { float eps; memcpy(&eps, dst->op_params, sizeof(float)); - size_t one_tensor_n_bytes = src->ne[0] * ggml_element_size(src); - ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes); - - aclTensor* acl_gamma = aclnn_values( - ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, 1, - ggml_cann_type_mapping(src->type), ggml_element_size(src)); - - size_t zero_tensor_n_bytes = - src->ne[1] * src->ne[2] * src->ne[3] * ggml_element_size(src); - ggml_cann_pool_alloc zero_tensor_allocator(ctx.pool(), zero_tensor_n_bytes); - aclTensor* acl_rstd = - aclnn_zero(ctx, zero_tensor_allocator.get(), zero_tensor_n_bytes, - src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type), - ggml_element_size(src)); + + // build gamma, one... + size_t acl_gamma_nb[GGML_MAX_DIMS]; + acl_gamma_nb[0] = sizeof(float); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + acl_gamma_nb[i] = acl_gamma_nb[i - 1] * src->ne[i - 1]; + } + aclTensor* acl_gamma = get_f32_cache_acl_tensor( + ctx, + &ctx.f32_one_cache, + ctx.f32_one_cache_element, + src->ne, + acl_gamma_nb, + 1, // dims + 1.0f // value + ); + + // build rstd, zero... + size_t acl_rstd_nb[GGML_MAX_DIMS]; + acl_rstd_nb[0] = sizeof(float); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + acl_rstd_nb[i] = acl_rstd_nb[i - 1] * src->ne[i - 1]; + } + aclTensor* acl_rstd = get_f32_cache_acl_tensor( + ctx, + &ctx.f32_zero_cache, + ctx.f32_zero_cache_element, + src->ne, + acl_rstd_nb, + GGML_MAX_DIMS, + 0.0f // value + ); + GGML_CANN_CALL_ACLNN_OP(ctx, RmsNorm, acl_src, acl_gamma, eps, acl_dst, acl_rstd); ggml_cann_release_resources(ctx, acl_src, acl_dst, acl_gamma, acl_rstd); } @@ -903,14 +1002,13 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, const int n_past = ((int32_t*)dst->op_params)[0]; - size_t one_tensor_n_bytes = src->ne[0] * src->ne[1] * src->ne[2] * - src->ne[3] * ggml_element_size(src); - ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes); + ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), ggml_nbytes(src)); + void* buffer = one_tensor_allocator.get(); - aclTensor* mask_tensor = - aclnn_values(ctx, one_tensor_allocator.get(), one_tensor_n_bytes, - src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type), - ggml_element_size(src), value); + aclTensor* mask_tensor = ggml_cann_create_tensor(buffer, ggml_cann_type_mapping(src->type), + ggml_type_size(src->type), src->ne, src->nb, GGML_MAX_DIMS); + + aclnn_fill_scalar(ctx, value, mask_tensor); aclScalar* alpha = nullptr; float alphaValue = 1.0f; @@ -1277,23 +1375,6 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, tmp_permute_tensor, tmp_mul_tensor, acl_dst); } -/** - * @brief Fills a tensor with a scalar value. - * - * This function fills the destination tensor `acl_dst` with the scalar value - * `scalar`. - * - * @param ctx The context for the CANN backend operations. - * @param scalar The scalar value used to fill the tensor. - * @param acl_dst The destination tensor to be filled with the scalar value. - */ -static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar, - aclTensor* acl_dst) { - auto acl_scalar = aclCreateScalar(&scalar, aclDataType::ACL_FLOAT); - GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_dst, acl_scalar); - ggml_cann_release_resources(ctx, acl_scalar); -} - /** * @brief Raises each element of a tensor to the power of the corresponding * element in another tensor. diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h index 2c2033bfba857..5858bd3f6a197 100755 --- a/ggml/src/ggml-cann/common.h +++ b/ggml/src/ggml-cann/common.h @@ -379,6 +379,10 @@ struct ggml_backend_cann_context { cann_task_queue task_queue; bool async_mode; bool support_set_rows; + void* f32_zero_cache = nullptr; + void* f32_one_cache = nullptr; + int64_t f32_zero_cache_element = 0; + int64_t f32_one_cache_element = 0; aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */