@@ -894,14 +894,13 @@ static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar,
 }
 
 /**
- * @brief Get or expand a cached float32 tensor filled with a scalar value.
+ * @brief Get or expand a cached tensor filled with a scalar value.
  *
- * This function manages cached device memory for float32 tensors. If the current
+ * This function manages cached device memory for tensors. If the current
  * cache size is insufficient for the requested tensor shape, the old memory will
- * be released and new memory will be allocated. The allocated buffer is then
- * initialized either with zeros (when @p value == 0.0f) or with the given scalar
- * value using CANN operations. Finally, an aclTensor object is created from the
- * cached memory and returned.
+ * be released and new memory will be allocated. The allocated buffer is
+ * initialized with the given scalar value using CANN operations.
+ * Finally, an aclTensor object is created from the cached memory and returned.
  *
  * @param ctx The CANN backend context that manages device memory.
  * @param buffer A pointer to the cached device buffer (will be allocated
@@ -910,25 +909,27 @@ static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar,
  *               updated when the cache is expanded.
  * @param ne The tensor shape array (number of elements in each dimension).
  * @param nb The stride size for each dimension.
+ * @param dtype Data type of the cached tensor.
  * @param dims The number of tensor dimensions.
  * @param value The scalar value used to fill the tensor (supports zero
  *              initialization via memset or arbitrary values via fill_scalar).
  * @return An aclTensor pointer created from the cached buffer.
  */
-static aclTensor* get_f32_cache_acl_tensor(
+static aclTensor* get_cache_acl_tensor(
         ggml_backend_cann_context& ctx,
         void** buffer,
         int64_t &cache_element,
         int64_t* ne,
         size_t* nb,
+        ggml_type dtype,
         int64_t dims,
         float value) {
     // Calculate total number of elements
     int64_t n_element = 1;
     for (int i = 0; i < dims; i++) {
         n_element *= ne[i];
     }
-    size_t size = n_element * sizeof(float);
+    size_t size = n_element * ggml_type_size(dtype);
 
     // Allocate or expand cache if needed
     if (cache_element < n_element) {
@@ -941,19 +942,17 @@ static aclTensor* get_f32_cache_acl_tensor(
         cache_element = n_element;
 
         // Initialize cache
-        if (value == 0.0f) {
-            ACL_CHECK(aclrtMemsetAsync(*buffer, size, 0, size, ctx.stream()));
-        } else {
-            int64_t pool_ne[1] = { n_element };
-            size_t pool_nb[1] = { sizeof(float) };
-            aclTensor* acl_value = ggml_cann_create_tensor(
-                *buffer, ACL_FLOAT, sizeof(float), pool_ne, pool_nb, 1);
-            aclnn_fill_scalar(ctx, 1, acl_value);
-            ggml_cann_release_resources(ctx, acl_value);
-        }
+        int64_t pool_ne[1] = { n_element };
+        size_t pool_nb[1] = { ggml_type_size(dtype) };
+        aclTensor* acl_value = ggml_cann_create_tensor(
+            *buffer, ggml_cann_type_mapping(dtype), ggml_type_size(dtype),
+            pool_ne, pool_nb, 1);
+        aclnn_fill_scalar(ctx, value, acl_value);
+        ggml_cann_release_resources(ctx, acl_value);
     }
 
-    return ggml_cann_create_tensor(*buffer, ACL_FLOAT, sizeof(float), ne, nb, dims);
+    return ggml_cann_create_tensor(*buffer, ggml_cann_type_mapping(dtype),
+                                   ggml_type_size(dtype), ne, nb, dims);
 }
 
 void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
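(Not part of the diff.) As a quick illustration of the grow-only cache pattern that get_cache_acl_tensor implements, here is a minimal host-side sketch: the buffer is reallocated only when the requested element count outgrows the cached capacity, and is refilled with the scalar on (re)allocation. The helper name, the plain malloc/loop fill, and the f32-only fill are illustrative stand-ins for the CANN pool, aclnn_fill_scalar, and aclTensor creation.

#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Simplified analogue of the cached-tensor helper: grow-only buffer keyed by
// element count; the fill loop stands in for aclnn_fill_scalar on the device.
static void* get_cache_buffer(void** buffer, int64_t& cache_element,
                              int64_t n_element, float value) {
    if (cache_element < n_element) {
        free(*buffer);                                // release the undersized cache
        *buffer = malloc(n_element * sizeof(float));  // size = n_element * element size
        cache_element = n_element;
        float* p = static_cast<float*>(*buffer);
        for (int64_t i = 0; i < n_element; i++) {
            p[i] = value;                             // initialize the new cache
        }
    }
    return *buffer;                                   // reused as-is when big enough
}

int main() {
    void* cache = nullptr;
    int64_t cached = 0;
    get_cache_buffer(&cache, cached, 16, 1.0f);       // allocates and fills
    get_cache_buffer(&cache, cached, 8, 1.0f);        // reuses the larger cache
    printf("cached elements: %lld\n", (long long) cached);
    free(cache);
    return 0;
}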
@@ -965,35 +964,39 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     float eps;
     memcpy(&eps, dst->op_params, sizeof(float));
 
-    // build gamma, one...
+    // build gamma.
     size_t acl_gamma_nb[GGML_MAX_DIMS];
-    acl_gamma_nb[0] = sizeof(float);
+    // gamma's type is the same as dst's.
+    acl_gamma_nb[0] = ggml_type_size(dst->type);
     for (int i = 1; i < GGML_MAX_DIMS; i++) {
         acl_gamma_nb[i] = acl_gamma_nb[i - 1] * src->ne[i - 1];
     }
-    aclTensor* acl_gamma = get_f32_cache_acl_tensor(
+    aclTensor* acl_gamma = get_cache_acl_tensor(
         ctx,
         &ctx.rms_norm_one_tensor_cache.cache,
         ctx.rms_norm_one_tensor_cache.size,
         src->ne,
         acl_gamma_nb,
+        dst->type,
         1,    // dims
         1.0f  // value
     );
 
-    // build rstd, zero...
+    // build rstd.
     int64_t acl_rstd_ne[] = {src->ne[1], src->ne[2], src->ne[3]};
     size_t acl_rstd_nb[GGML_MAX_DIMS - 1];
+    // rstd will always be F32.
     acl_rstd_nb[0] = sizeof(float);
     for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
         acl_rstd_nb[i] = acl_rstd_nb[i - 1] * acl_rstd_ne[i - 1];
     }
-    aclTensor* acl_rstd = get_f32_cache_acl_tensor(
+    aclTensor* acl_rstd = get_cache_acl_tensor(
         ctx,
         &ctx.rms_norm_zero_tensor_cache.cache,
         ctx.rms_norm_zero_tensor_cache.size,
         acl_rstd_ne,
         acl_rstd_nb,
+        GGML_TYPE_F32,
         GGML_MAX_DIMS - 1,
         0.0f  // value
     );
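(Not part of the diff.) A small worked example of the stride bookkeeping above, for a hypothetical src of shape ne = {4096, 8, 1, 1} with an F16 dst: the gamma strides follow the 2-byte dst element size, while rstd stays on 4-byte F32 with one value per row.

#include <cstdint>
#include <cstdio>

int main() {
    const int     dims = 4;                    // mirrors GGML_MAX_DIMS
    const int64_t ne[dims] = {4096, 8, 1, 1};  // hypothetical src->ne

    // gamma follows dst->type: 2 bytes per element for F16
    size_t gamma_nb[dims];
    gamma_nb[0] = 2;                           // ggml_type_size(GGML_TYPE_F16)
    for (int i = 1; i < dims; i++) {
        gamma_nb[i] = gamma_nb[i - 1] * ne[i - 1];
    }

    // rstd is always F32: one value per row of src
    int64_t rstd_ne[3] = {ne[1], ne[2], ne[3]};
    size_t  rstd_nb[3];
    rstd_nb[0] = sizeof(float);
    for (int i = 1; i < 3; i++) {
        rstd_nb[i] = rstd_nb[i - 1] * rstd_ne[i - 1];
    }

    printf("gamma_nb = {%zu, %zu, %zu, %zu}\n",
           gamma_nb[0], gamma_nb[1], gamma_nb[2], gamma_nb[3]);
    printf("rstd_nb  = {%zu, %zu, %zu}\n", rstd_nb[0], rstd_nb[1], rstd_nb[2]);
    return 0;
}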
@@ -1765,41 +1768,42 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_tensor* src0 = dst->src[0];  // src
     ggml_tensor* src1 = dst->src[1];  // index
 
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+
     switch (src0->type) {
-        case GGML_TYPE_F32: {
-            aclnn_index_select_4d(ctx, src0->data, src0->ne, src0->nb,
-                                  dst->data, dst->ne, dst->nb,
-                                  src1, dst->type);
-            break;
-        }
-        case GGML_TYPE_F16: {
-            aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
-            ggml_cann_pool_alloc src_buffer_allocator(
-                ctx.pool(), ggml_nelements(src0) * sizeof(float));
-            void* src_trans_buffer = src_buffer_allocator.get();
-            size_t src_trans_nb[GGML_MAX_DIMS];
-            src_trans_nb[0] = sizeof(float);
-            for (int i = 1; i < GGML_MAX_DIMS; i++) {
-                src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+        case GGML_TYPE_F16:
+        case GGML_TYPE_F32:
+            if (src0->type == dst->type) {
+                aclnn_index_select_4d(ctx, src0->data, src0->ne, src0->nb,
+                                      dst->data, dst->ne, dst->nb,
+                                      src1, dst->type);
+            } else {
+                aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
+                ggml_cann_pool_alloc src_buffer_allocator(
+                    ctx.pool(), ggml_nelements(src0) * ggml_element_size(dst));
+                void* src_trans_buffer = src_buffer_allocator.get();
+                size_t src_trans_nb[GGML_MAX_DIMS];
+                src_trans_nb[0] = dst->nb[0];
+                for (int i = 1; i < GGML_MAX_DIMS; i++) {
+                    src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+                }
+                aclTensor* src_trans_tensor = ggml_cann_create_tensor(
+                    src_trans_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
+                    src0->ne, src_trans_nb, GGML_MAX_DIMS);
+                aclnn_cast(ctx, acl_src0, src_trans_tensor, ggml_cann_type_mapping(dst->type));
+                aclnn_index_select_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb,
+                                      dst->data, dst->ne, dst->nb,
+                                      src1, dst->type);
+                ggml_cann_release_resources(ctx, acl_src0, src_trans_tensor);
             }
-            aclTensor* src_trans_tensor = ggml_cann_create_tensor(
-                src_trans_buffer, ACL_FLOAT, ggml_type_size(dst->type),
-                src0->ne, src_trans_nb, GGML_MAX_DIMS);
-            aclnn_cast(ctx, acl_src0, src_trans_tensor, ggml_cann_type_mapping(dst->type));
-            aclnn_index_select_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb,
-                                  dst->data, dst->ne, dst->nb,
-                                  src1, dst->type);
-            ggml_cann_release_resources(ctx, acl_src0, src_trans_tensor);
             break;
-        }
         case GGML_TYPE_Q8_0: {
             // add 1 dim for bcast mul.
             size_t weight_nb[GGML_MAX_DIMS + 1], scale_nb[GGML_MAX_DIMS + 1],
                 dequant_nb[GGML_MAX_DIMS + 1];
             int64_t weight_ne[GGML_MAX_DIMS + 1], scale_ne[GGML_MAX_DIMS + 1],
                 *dequant_ne;
             int64_t scale_offset = 0;
-
             // [3,4,5,64] -> [3,4,5,2,32]
             weight_ne[0] = QK8_0;
             weight_ne[1] = src0->ne[0] / QK8_0;
@@ -1809,7 +1813,6 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                 weight_ne[i] = src0->ne[i - 1];
                 weight_nb[i] = weight_nb[i - 1] * weight_ne[i - 1];
             }
-
             // [3,4,5,64] -> [3,4,5,2,1]
             scale_ne[0] = 1;
             scale_ne[1] = src0->ne[0] / QK8_0;
@@ -1819,35 +1822,30 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                 scale_ne[i] = src0->ne[i - 1];
                 scale_nb[i] = scale_nb[i - 1] * scale_ne[i - 1];
             }
-
             // [3,4,5,64] -> [3,4,5,2,32]
             dequant_ne = weight_ne;
-            dequant_nb[0] = sizeof(float);
+            dequant_nb[0] = ggml_type_size(dst->type);
             for (int i = 1; i < GGML_MAX_DIMS + 1; i++) {
                 dequant_nb[i] = dequant_nb[i - 1] * dequant_ne[i - 1];
             }
-
             scale_offset = ggml_nelements(src0) * sizeof(int8_t);
             ggml_cann_pool_alloc dequant_buffer_allocator(
-                ctx.pool(), ggml_nelements(src0) * sizeof(float));
-
+                ctx.pool(), ggml_nelements(src0) * ggml_type_size(dst->type));
             aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
                 src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb,
                 GGML_MAX_DIMS + 1);
             aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
                 src0->data, ACL_FLOAT16, sizeof(uint16_t), scale_ne, scale_nb,
                 GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
             aclTensor* dequant_tensor = ggml_cann_create_tensor(
-                dequant_buffer_allocator.get(), ACL_FLOAT, sizeof(float),
+                dequant_buffer_allocator.get(), ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
                 dequant_ne, dequant_nb, GGML_MAX_DIMS + 1);
-
             aclnn_mul(ctx, acl_weight_tensor, acl_scale_tensor, dequant_tensor);
-            dequant_nb[0] = sizeof(float);
+            dequant_nb[0] = ggml_type_size(dst->type);
             dequant_ne = src0->ne;
             for (int i = 1; i < GGML_MAX_DIMS; i++) {
                 dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1];
             }
-
             aclnn_index_select_4d(ctx, dequant_buffer_allocator.get(),
                                   dequant_ne, dequant_nb,
                                   dst->data, dst->ne, dst->nb,
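(Not part of the diff.) The Q8_0 shape/stride arithmetic above, worked through for the hypothetical src0->ne = {64, 5, 4, 3} behind the "[3,4,5,64]" comments, assuming an F16 dst (2-byte dequant elements). As the scale_offset computation implies, the transformed CANN buffer stores all int8 quants first and the fp16 scales after them.

#include <cstdint>
#include <cstdio>

int main() {
    const int     QK        = 32;               // QK8_0: elements per Q8_0 block
    const int     dims      = 5;                // GGML_MAX_DIMS + 1
    const int64_t src_ne[4] = {64, 5, 4, 3};    // hypothetical src0->ne
    const size_t  dst_elem  = 2;                // ggml_type_size(GGML_TYPE_F16)

    int64_t weight_ne[dims], scale_ne[dims];
    size_t  weight_nb[dims], scale_nb[dims], dequant_nb[dims];

    // quants viewed as [32, 2, 5, 4, 3]; one fp16 scale per block as [1, 2, 5, 4, 3]
    weight_ne[0] = QK;             scale_ne[0] = 1;
    weight_ne[1] = src_ne[0] / QK; scale_ne[1] = src_ne[0] / QK;
    for (int i = 2; i < dims; i++) {
        weight_ne[i] = src_ne[i - 1];
        scale_ne[i]  = src_ne[i - 1];
    }

    weight_nb[0]  = sizeof(int8_t);
    scale_nb[0]   = sizeof(uint16_t);           // fp16 scale
    dequant_nb[0] = dst_elem;                   // dequant buffer uses dst->type
    for (int i = 1; i < dims; i++) {
        weight_nb[i]  = weight_nb[i - 1]  * weight_ne[i - 1];
        scale_nb[i]   = scale_nb[i - 1]   * scale_ne[i - 1];
        dequant_nb[i] = dequant_nb[i - 1] * weight_ne[i - 1];
    }

    // scales start right after all int8 quants in the transformed buffer
    int64_t n = 1;
    for (int i = 0; i < 4; i++) n *= src_ne[i];
    size_t scale_offset = (size_t) n * sizeof(int8_t);

    printf("weight_ne = {%lld, %lld, %lld, %lld, %lld}\n",
           (long long) weight_ne[0], (long long) weight_ne[1], (long long) weight_ne[2],
           (long long) weight_ne[3], (long long) weight_ne[4]);
    printf("scale_offset = %zu bytes, dequant_nb[1] = %zu bytes\n",
           scale_offset, dequant_nb[1]);
    return 0;
}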
@@ -1965,16 +1963,8 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
     // Only check env once.
     static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
     if (weight_to_nz && is_matmul_weight(weight)) {
-        int64_t acl_stride[2] = {1, transpose_ne[1]};
-
-        // Reverse ne.
-        std::reverse(transpose_ne, transpose_ne + n_dims);
-
-        std::vector<int64_t> storageDims = {transpose_ne[0], transpose_ne[1]};
-
-        acl_weight_tensor = aclCreateTensor(
-            transpose_ne, n_dims, ggml_cann_type_mapping(weight->type), acl_stride,
-            0, ACL_FORMAT_FRACTAL_NZ, storageDims.data(), 2, weight->data);
+        acl_weight_tensor =
+            ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_FRACTAL_NZ);
     } else {
         acl_weight_tensor =
             ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_ND);
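(Not part of the diff.) The "only check env once" idiom used above for GGML_CANN_WEIGHT_NZ, sketched with stand-in helpers rather than ggml-cann's get_env/parse_bool: a function-local static means the environment is read on the first call only, and the toggle defaults to on.

#include <cstdlib>
#include <string>

// Stand-in for get_env(name).value_or(fallback).
static std::string env_or(const char* name, const char* fallback) {
    const char* v = std::getenv(name);
    return v != nullptr ? std::string(v) : std::string(fallback);
}

// Stand-in for parse_bool(); the accepted spellings here are illustrative.
static bool to_bool(const std::string& s) {
    return s == "on" || s == "1" || s == "yes" || s == "true";
}

bool weight_nz_enabled() {
    // static local: the environment is inspected only on the first call
    static bool enabled = to_bool(env_or("GGML_CANN_WEIGHT_NZ", "on"));
    return enabled;
}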
@@ -3178,7 +3168,6 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
     aclTensor* acl_src0_f16_tensor = nullptr;
     aclTensor* acl_src1_f16_tensor = nullptr;
     aclTensor* acl_src2_f16_tensor = nullptr;
-    aclTensor* acl_dst_f16_tensor = nullptr;
 
     // Step 1: cast the src0 (Query) to fp16 if needed
     ggml_cann_pool_alloc src0_f16_allocator(ctx.pool());
@@ -3216,22 +3205,6 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
     acl_src2_f16_tensor = ggml_cann_create_tensor(src2, src2_bsnd_ne,
                                                   src2_bsnd_nb, GGML_MAX_DIMS);
 
-    ggml_cann_pool_alloc out_f16_allocator(ctx.pool());
-    void* out_f16_buffer = out_f16_allocator.alloc(
-        ggml_nelements(dst) * faElemSize);
-
-    int64_t* out_f16_ne = src0_bsnd_ne;
-    size_t out_f16_nb[GGML_MAX_DIMS];
-    out_f16_nb[0] = faElemSize;
-    for (int i = 1; i < GGML_MAX_DIMS; ++i){
-        out_f16_nb[i] = out_f16_nb[i - 1] * out_f16_ne[i - 1];
-    }
-
-    acl_dst_f16_tensor = ggml_cann_create_tensor(
-        out_f16_buffer, faDataType, faElemSize,
-        out_f16_ne, out_f16_nb, GGML_MAX_DIMS
-    );
-
     // Step 3: create the PSEShift tensor if needed
     // this tensor is considered as mask (f16) in the llama.cpp
     aclTensor* bcast_pse_tensor = nullptr;
@@ -3334,8 +3307,29 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
     int64_t keyAntiquantMode = 0;
     int64_t valueAntiquantMode = 0;
 
-    // Step 5: launch the FusedInferAttentionScoreV2 kernel.
-    // Refer to https://gitee.com/ascend/cann-ops-adv/blob/master/docs/FusedInferAttentionScoreV2.md
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    aclTensor * fa_dst_tensor = nullptr;
+    aclTensor * acl_dst_tensor = nullptr;
+    ggml_cann_pool_alloc out_f16_allocator(ctx.pool());
+    if (dst->type == GGML_TYPE_F32) {
+        void* out_f16_buffer = out_f16_allocator.alloc(
+            ggml_nelements(dst) * faElemSize);
+
+        int64_t* out_f16_ne = src0_bsnd_ne;
+        size_t out_f16_nb[GGML_MAX_DIMS];
+        out_f16_nb[0] = faElemSize;
+        for (int i = 1; i < GGML_MAX_DIMS; ++i){
+            out_f16_nb[i] = out_f16_nb[i - 1] * out_f16_ne[i - 1];
+        }
+
+        fa_dst_tensor = ggml_cann_create_tensor(
+            out_f16_buffer, faDataType, faElemSize,
+            out_f16_ne, out_f16_nb, GGML_MAX_DIMS
+        );
+    }
+    else {
+        fa_dst_tensor = ggml_cann_create_tensor(dst);
+    }
 
     GGML_CANN_CALL_ACLNN_OP(ctx, FusedInferAttentionScoreV2,
         acl_q_tensor, acl_k_tensor_list, acl_v_tensor_list, // q, k, v
@@ -3357,23 +3351,24 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
         blockSize, antiquantMode, // blockSize, antiquantMode
         softmaxLseFlag, // softmaxLseFlag
         keyAntiquantMode, valueAntiquantMode, // keyAntiqMode, valueAntiqMode
-        acl_dst_f16_tensor, // attentionOut
+        fa_dst_tensor, // attentionOut
         nullptr // softmaxLse
     );
 
-    // Step 6: post-processing, permute and cast to f32
-    aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
-    // TODO: when dst is fp16, don't need cast
-    aclnn_cast(ctx, acl_dst_f16_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
-    ggml_cann_release_resources(ctx, acl_src0_f16_tensor,
-                                acl_src1_f16_tensor,
-                                acl_src2_f16_tensor,
-                                acl_dst_f16_tensor,
-                                acl_dst_tensor);
-    if (src3 != nullptr){
-        ggml_cann_release_resources(ctx, bcast_pse_tensor);
+    if (dst->type == GGML_TYPE_F32) {
+        // Step 6: post-processing, permute and cast to f32
+        acl_dst_tensor = ggml_cann_create_tensor(dst);
+        aclnn_cast(ctx, fa_dst_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
     }
-    }else {
+
+    ggml_cann_release_resources(ctx, acl_src0_f16_tensor,
+                                acl_src1_f16_tensor,
+                                acl_src2_f16_tensor,
+                                fa_dst_tensor,
+                                acl_dst_tensor,
+                                bcast_pse_tensor);
+
+    } else {
         GGML_ABORT("Function is not implemented.");
     }
 }
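(Not part of the diff.) The destination-type dispatch the flash-attention hunks introduce, reduced to a host-side sketch: the kernel always produces f16-sized output, so an intermediate buffer plus a single cast is needed only when dst is F32, while an F16 dst is written in place. Every name below is a stand-in, not a ggml or CANN API.

#include <cstdint>
#include <cstdio>
#include <vector>

enum class DType { F16, F32 };

// Pretend attention kernel: always emits 16-bit values (small ints here).
static void fake_attention_kernel(uint16_t* out, size_t n) {
    for (size_t i = 0; i < n; i++) out[i] = static_cast<uint16_t>(i);
}

static void run_attention(DType dst_type, size_t n, void* dst_data) {
    std::vector<uint16_t> scratch;                       // like out_f16_allocator
    uint16_t* kernel_out = static_cast<uint16_t*>(dst_data);
    if (dst_type == DType::F32) {
        scratch.resize(n);                               // only allocated for an F32 dst
        kernel_out = scratch.data();
    }
    fake_attention_kernel(kernel_out, n);                // like FusedInferAttentionScoreV2
    if (dst_type == DType::F32) {
        // one widening pass at the end, standing in for the aclnn_cast call
        float* out = static_cast<float*>(dst_data);
        for (size_t i = 0; i < n; i++) out[i] = static_cast<float>(scratch[i]);
    }
}

int main() {
    std::vector<float>    dst_f32(4);
    std::vector<uint16_t> dst_f16(4);
    run_attention(DType::F32, dst_f32.size(), dst_f32.data());
    run_attention(DType::F16, dst_f16.size(), dst_f16.data());
    printf("f32 dst[3] = %.1f, f16 dst[3] = %u\n", dst_f32[3], (unsigned) dst_f16[3]);
    return 0;
}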