@@ -867,6 +867,86 @@ static aclTensor* aclnn_values(ggml_backend_cann_context& ctx, void* buffer,
     return acl_tensor;
 }
 
+/**
+ * @brief Fills a tensor with a scalar value.
+ *
+ * This function fills the destination tensor `acl_dst` with the scalar value
+ * `scalar`.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param scalar The scalar value used to fill the tensor.
+ * @param acl_dst The destination tensor to be filled with the scalar value.
+ */
+static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar,
+                              aclTensor* acl_dst) {
+    auto acl_scalar = aclCreateScalar(&scalar, aclDataType::ACL_FLOAT);
+    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_dst, acl_scalar);
+    ggml_cann_release_resources(ctx, acl_scalar);
+}
+
+/**
+ * @brief Get or expand a cached float32 tensor filled with a scalar value.
+ *
+ * This function manages cached device memory for float32 tensors. If the current
+ * cache size is insufficient for the requested tensor shape, the old memory will
+ * be released and new memory will be allocated. The allocated buffer is then
+ * initialized either with zeros (when @p value == 0.0f) or with the given scalar
+ * value using CANN operations. Finally, an aclTensor object is created from the
+ * cached memory and returned.
+ *
+ * @param ctx The CANN backend context that manages device memory.
+ * @param buffer A pointer to the cached device buffer (will be allocated
+ *               or reallocated if necessary).
+ * @param cache_element The current number of cached elements. This will be
+ *                      updated when the cache is expanded.
+ * @param ne The tensor shape array (number of elements in each dimension).
+ * @param nb The stride size for each dimension.
+ * @param dims The number of tensor dimensions.
+ * @param value The scalar value used to fill the tensor (supports zero
+ *              initialization via memset or arbitrary values via fill_scalar).
+ * @return An aclTensor pointer created from the cached buffer.
+ */
+static aclTensor* get_f32_cache_acl_tensor(
+        ggml_backend_cann_context& ctx,
+        void** buffer,
+        int64_t& cache_element,
+        int64_t* ne,
+        size_t* nb,
+        int64_t dims,
+        float value) {
+    // Calculate total number of elements
+    int64_t n_element = 1;
+    for (int i = 0; i < dims; i++) {
+        n_element *= ne[i];
+    }
+    size_t size = n_element * sizeof(float);
+
+    // Allocate or expand cache if needed
+    if (cache_element < n_element) {
+        if (*buffer != nullptr) {
+            aclrtFree(*buffer);
+            *buffer = nullptr;
+        }
+
+        ACL_CHECK(aclrtMalloc(buffer, size, ACL_MEM_MALLOC_HUGE_FIRST));
+        cache_element = n_element;
+
+        // Initialize cache: fast path via memset for zeros, otherwise fill
+        // with the requested scalar value.
+        if (value == 0.0f) {
+            ACL_CHECK(aclrtMemsetAsync(*buffer, size, 0, size, ctx.stream()));
+        } else {
+            int64_t pool_ne[1] = { n_element };
+            size_t pool_nb[1] = { sizeof(float) };
+            aclTensor* acl_value = ggml_cann_create_tensor(
+                *buffer, ACL_FLOAT, sizeof(float), pool_ne, pool_nb, 1);
+            aclnn_fill_scalar(ctx, value, acl_value);
+            ggml_cann_release_resources(ctx, acl_value);
+        }
+    }
+
+    return ggml_cann_create_tensor(*buffer, ACL_FLOAT, sizeof(float), ne, nb, dims);
+}
+
 void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_tensor* src = dst->src[0];
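The cache introduced above is a grow-only device buffer: it is reallocated only when a larger tensor is requested, and it is refilled with the scalar once per (re)allocation, so repeated requests of the same or smaller size reuse the memory untouched. Here is a minimal host-side sketch of the same pattern using plain `malloc`/`memset` in place of `aclrtMalloc`/`aclrtMemsetAsync`; the `ScalarCache` type and `get_f32_cache` name are illustrative, not part of the backend.

```cpp
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>

// Host-side analogue of the f32 one/zero caches: the buffer only grows.
struct ScalarCache {
    void*   buffer  = nullptr;
    int64_t element = 0;  // number of floats currently allocated
};

static float* get_f32_cache(ScalarCache& cache, int64_t n_element, float value) {
    if (cache.element < n_element) {
        std::free(cache.buffer);
        cache.buffer  = std::malloc(n_element * sizeof(float));
        cache.element = n_element;

        float* p = static_cast<float*>(cache.buffer);
        if (value == 0.0f) {
            std::memset(p, 0, n_element * sizeof(float));  // memset fast path
        } else {
            for (int64_t i = 0; i < n_element; i++) p[i] = value;
        }
    }
    return static_cast<float*>(cache.buffer);
}

int main() {
    ScalarCache ones;
    float* a = get_f32_cache(ones, 4, 1.0f);  // allocates and fills
    float* b = get_f32_cache(ones, 2, 1.0f);  // smaller request: reuses buffer
    std::printf("%p == %p, a[3] = %.1f\n", (void*)a, (void*)b, a[3]);
    std::free(ones.buffer);
    return 0;
}
```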
@@ -875,20 +955,39 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
     float eps;
     memcpy(&eps, dst->op_params, sizeof(float));
-    size_t one_tensor_n_bytes = src->ne[0] * ggml_element_size(src);
-    ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes);
-
-    aclTensor* acl_gamma = aclnn_values(
-        ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, 1,
-        ggml_cann_type_mapping(src->type), ggml_element_size(src));
-
-    size_t zero_tensor_n_bytes =
-        src->ne[1] * src->ne[2] * src->ne[3] * ggml_element_size(src);
-    ggml_cann_pool_alloc zero_tensor_allocator(ctx.pool(), zero_tensor_n_bytes);
-    aclTensor* acl_rstd =
-        aclnn_zero(ctx, zero_tensor_allocator.get(), zero_tensor_n_bytes,
-                   src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
-                   ggml_element_size(src));
+
+    // build gamma, a cached tensor filled with ones
+    size_t acl_gamma_nb[GGML_MAX_DIMS];
+    acl_gamma_nb[0] = sizeof(float);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        acl_gamma_nb[i] = acl_gamma_nb[i - 1] * src->ne[i - 1];
+    }
+    aclTensor* acl_gamma = get_f32_cache_acl_tensor(
+        ctx,
+        &ctx.f32_one_cache,
+        ctx.f32_one_cache_element,
+        src->ne,
+        acl_gamma_nb,
+        1,     // dims
+        1.0f   // value
+    );
+
+    // build rstd, a cached tensor filled with zeros
+    size_t acl_rstd_nb[GGML_MAX_DIMS];
+    acl_rstd_nb[0] = sizeof(float);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        acl_rstd_nb[i] = acl_rstd_nb[i - 1] * src->ne[i - 1];
+    }
+    aclTensor* acl_rstd = get_f32_cache_acl_tensor(
+        ctx,
+        &ctx.f32_zero_cache,
+        ctx.f32_zero_cache_element,
+        src->ne,
+        acl_rstd_nb,
+        GGML_MAX_DIMS,  // dims
+        0.0f            // value
+    );
+
     GGML_CANN_CALL_ACLNN_OP(ctx, RmsNorm, acl_src, acl_gamma, eps, acl_dst, acl_rstd);
     ggml_cann_release_resources(ctx, acl_src, acl_dst, acl_gamma, acl_rstd);
 }
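The two stride loops in this hunk build contiguous byte strides from the shape: `nb[0] = sizeof(float)` and `nb[i] = nb[i - 1] * ne[i - 1]`. A standalone sketch of the same computation follows; the `{4096, 8, 4, 1}` shape is a made-up example, not taken from the patch.

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // Contiguous f32 strides, mirroring the acl_gamma_nb / acl_rstd_nb loops.
    const int GGML_MAX_DIMS = 4;                  // ggml's fixed 4-D layout
    int64_t ne[GGML_MAX_DIMS] = {4096, 8, 4, 1};  // example shape only
    size_t  nb[GGML_MAX_DIMS];
    nb[0] = sizeof(float);
    for (int i = 1; i < GGML_MAX_DIMS; i++) {
        nb[i] = nb[i - 1] * ne[i - 1];
    }
    // Expected output: 4, 16384, 131072, 524288 bytes.
    for (int i = 0; i < GGML_MAX_DIMS; i++) {
        std::printf("nb[%d] = %zu\n", i, nb[i]);
    }
    return 0;
}
```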
@@ -903,14 +1002,13 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst,
 
     const int n_past = ((int32_t*)dst->op_params)[0];
 
-    size_t one_tensor_n_bytes = src->ne[0] * src->ne[1] * src->ne[2] *
-                                src->ne[3] * ggml_element_size(src);
-    ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes);
+    ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), ggml_nbytes(src));
+    void* buffer = one_tensor_allocator.get();
 
-    aclTensor* mask_tensor =
-        aclnn_values(ctx, one_tensor_allocator.get(), one_tensor_n_bytes,
-                     src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
-                     ggml_element_size(src), value);
+    aclTensor* mask_tensor = ggml_cann_create_tensor(buffer, ggml_cann_type_mapping(src->type),
+        ggml_type_size(src->type), src->ne, src->nb, GGML_MAX_DIMS);
+
+    aclnn_fill_scalar(ctx, value, mask_tensor);
 
     aclScalar* alpha = nullptr;
     float alphaValue = 1.0f;
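After this change, diag_mask takes a pool buffer of `ggml_nbytes(src)` bytes, wraps it as a tensor with the source's shape and strides, and fills the whole mask with `value` via `aclnn_fill_scalar`. A CPU sketch of that fill-then-mask idea follows; the shape, `n_past`, and the triangular condition are illustrative assumptions, not the backend's kernel.

```cpp
#include <cstdio>

int main() {
    // Fill a mask with `value`, then clear everything at or below the
    // diagonal shifted by n_past, leaving `value` only in the masked
    // (future) positions, as a diag-mask op would.
    const int rows = 4, cols = 4, n_past = 1;
    const float value = -1e9f;  // illustrative "minus infinity"
    float mask[rows][cols];

    for (int r = 0; r < rows; r++)
        for (int c = 0; c < cols; c++)
            mask[r][c] = value;  // fill_scalar step

    for (int r = 0; r < rows; r++)
        for (int c = 0; c < cols; c++)
            if (c <= r + n_past) mask[r][c] = 0.0f;  // keep visible positions

    for (int r = 0; r < rows; r++) {
        for (int c = 0; c < cols; c++) std::printf("% .0e ", mask[r][c]);
        std::printf("\n");
    }
    return 0;
}
```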
@@ -1277,23 +1375,6 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
                            tmp_permute_tensor, tmp_mul_tensor, acl_dst);
 }
 
-/**
- * @brief Fills a tensor with a scalar value.
- *
- * This function fills the destination tensor `acl_dst` with the scalar value
- * `scalar`.
- *
- * @param ctx The context for the CANN backend operations.
- * @param scalar The scalar value used to fill the tensor.
- * @param acl_dst The destination tensor to be filled with the scalar value.
- */
-static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar,
-                              aclTensor* acl_dst) {
-    auto acl_scalar = aclCreateScalar(&scalar, aclDataType::ACL_FLOAT);
-    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_dst, acl_scalar);
-    ggml_cann_release_resources(ctx, acl_scalar);
-}
-
 /**
  * @brief Raises each element of a tensor to the power of the corresponding
  * element in another tensor.