@@ -6,6 +6,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
66 BLASLONG gvl = 0 ;
77 BLASLONG m_top = 0 ;
88 BLASLONG n_top = 0 ;
9+ __bf16 * BB = (__bf16 * )(B );
10+ __bf16 * AA = (__bf16 * )(A );
911
1012 // -- MAIN PASS
1113 for (BLASLONG j = 0 ; j < N /8 ; j += 1 ) {
@@ -26,17 +28,17 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
2628 vfloat32m2_t result7 = __riscv_vfmv_v_f_f32m2 (0.0f , gvl );
2729
2830 for (BLASLONG k = 0 ; k < K ; k ++ ) {
29- __bf16 B0 = B [bi + 0 ];
30- __bf16 B1 = B [bi + 1 ];
31- __bf16 B2 = B [bi + 2 ];
32- __bf16 B3 = B [bi + 3 ];
33- __bf16 B4 = B [bi + 4 ];
34- __bf16 B5 = B [bi + 5 ];
35- __bf16 B6 = B [bi + 6 ];
36- __bf16 B7 = B [bi + 7 ];
31+ __bf16 B0 = BB [bi + 0 ];
32+ __bf16 B1 = BB [bi + 1 ];
33+ __bf16 B2 = BB [bi + 2 ];
34+ __bf16 B3 = BB [bi + 3 ];
35+ __bf16 B4 = BB [bi + 4 ];
36+ __bf16 B5 = BB [bi + 5 ];
37+ __bf16 B6 = BB [bi + 6 ];
38+ __bf16 B7 = BB [bi + 7 ];
3739 bi += 8 ;
3840
39- vbfloat16m1_t A0 = __riscv_vle16_v_bf16m1 ( & A [ai + 0 * gvl ], gvl );
41+ vbfloat16m1_t A0 = __riscv_vle16_v_bf16m1 ( & AA [ai + 0 * gvl ], gvl );
4042 ai += 16 ;
4143
4244 result0 = __riscv_vfwmaccbf16_vf_f32m2 (result0 , B0 , A0 , gvl );
@@ -100,17 +102,17 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
100102 vfloat32m1_t result7 = __riscv_vfmv_v_f_f32m1 (0.0f , gvl );
101103
102104 for (BLASLONG k = 0 ; k < K ; k ++ ) {
103- __bf16 B0 = B [bi + 0 ];
104- __bf16 B1 = B [bi + 1 ];
105- __bf16 B2 = B [bi + 2 ];
106- __bf16 B3 = B [bi + 3 ];
107- __bf16 B4 = B [bi + 4 ];
108- __bf16 B5 = B [bi + 5 ];
109- __bf16 B6 = B [bi + 6 ];
110- __bf16 B7 = B [bi + 7 ];
105+ __bf16 B0 = BB [bi + 0 ];
106+ __bf16 B1 = BB [bi + 1 ];
107+ __bf16 B2 = BB [bi + 2 ];
108+ __bf16 B3 = BB [bi + 3 ];
109+ __bf16 B4 = BB [bi + 4 ];
110+ __bf16 B5 = BB [bi + 5 ];
111+ __bf16 B6 = BB [bi + 6 ];
112+ __bf16 B7 = BB [bi + 7 ];
111113 bi += 8 ;
112114
113- vbfloat16mf2_t A0 = __riscv_vle16_v_bf16mf2 ( & A [ai + 0 * gvl ], gvl );
115+ vbfloat16mf2_t A0 = __riscv_vle16_v_bf16mf2 ( & AA [ai + 0 * gvl ], gvl );
114116 ai += 8 ;
115117
116118 result0 = __riscv_vfwmaccbf16_vf_f32m1 (result0 , B0 , A0 , gvl );
@@ -172,17 +174,17 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
172174 vfloat32m1_t result7 = __riscv_vfmv_v_f_f32m1 (0.0f , gvl );
173175
174176 for (BLASLONG k = 0 ; k < K ; ++ k ) {
175- __bf16 B0 = B [bi + 0 ];
176- __bf16 B1 = B [bi + 1 ];
177- __bf16 B2 = B [bi + 2 ];
178- __bf16 B3 = B [bi + 3 ];
179- __bf16 B4 = B [bi + 4 ];
180- __bf16 B5 = B [bi + 5 ];
181- __bf16 B6 = B [bi + 6 ];
182- __bf16 B7 = B [bi + 7 ];
177+ __bf16 B0 = BB [bi + 0 ];
178+ __bf16 B1 = BB [bi + 1 ];
179+ __bf16 B2 = BB [bi + 2 ];
180+ __bf16 B3 = BB [bi + 3 ];
181+ __bf16 B4 = BB [bi + 4 ];
182+ __bf16 B5 = BB [bi + 5 ];
183+ __bf16 B6 = BB [bi + 6 ];
184+ __bf16 B7 = BB [bi + 7 ];
183185 bi += 8 ;
184186
185- vbfloat16mf2_t A0 = __riscv_vle16_v_bf16mf2 ( & A [ai + 0 * gvl ], gvl );
187+ vbfloat16mf2_t A0 = __riscv_vle16_v_bf16mf2 ( & AA [ai + 0 * gvl ], gvl );
186188 ai += 4 ;
187189
188190 result0 = __riscv_vfwmaccbf16_vf_f32m1 (result0 , B0 , A0 , gvl );
@@ -256,22 +258,22 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
256258 BLASLONG bi = n_top * K ;
257259
258260 for (BLASLONG k = 0 ; k < K ; k ++ ) {
259- result0 += (float )(A [ai + 0 ])* (float )(B [bi + 0 ]);
260- result1 += (float )(A [ai + 1 ])* (float )(B [bi + 0 ]);
261- result2 += (float )(A [ai + 0 ])* (float )(B [bi + 1 ]);
262- result3 += (float )(A [ai + 1 ])* (float )(B [bi + 1 ]);
263- result4 += (float )(A [ai + 0 ])* (float )(B [bi + 2 ]);
264- result5 += (float )(A [ai + 1 ])* (float )(B [bi + 2 ]);
265- result6 += (float )(A [ai + 0 ])* (float )(B [bi + 3 ]);
266- result7 += (float )(A [ai + 1 ])* (float )(B [bi + 3 ]);
267- result8 += (float )(A [ai + 0 ])* (float )(B [bi + 4 ]);
268- result9 += (float )(A [ai + 1 ])* (float )(B [bi + 4 ]);
269- result10 += (float )(A [ai + 0 ])* (float )(B [bi + 5 ]);
270- result11 += (float )(A [ai + 1 ])* (float )(B [bi + 5 ]);
271- result12 += (float )(A [ai + 0 ])* (float )(B [bi + 6 ]);
272- result13 += (float )(A [ai + 1 ])* (float )(B [bi + 6 ]);
273- result14 += (float )(A [ai + 0 ])* (float )(B [bi + 7 ]);
274- result15 += (float )(A [ai + 1 ])* (float )(B [bi + 7 ]);
261+ result0 += (float )(AA [ai + 0 ])* (float )(BB [bi + 0 ]);
262+ result1 += (float )(AA [ai + 1 ])* (float )(BB [bi + 0 ]);
263+ result2 += (float )(AA [ai + 0 ])* (float )(BB [bi + 1 ]);
264+ result3 += (float )(AA [ai + 1 ])* (float )(BB [bi + 1 ]);
265+ result4 += (float )(AA [ai + 0 ])* (float )(BB [bi + 2 ]);
266+ result5 += (float )(AA [ai + 1 ])* (float )(BB [bi + 2 ]);
267+ result6 += (float )(AA [ai + 0 ])* (float )(BB [bi + 3 ]);
268+ result7 += (float )(AA [ai + 1 ])* (float )(BB [bi + 3 ]);
269+ result8 += (float )(AA [ai + 0 ])* (float )(BB [bi + 4 ]);
270+ result9 += (float )(AA [ai + 1 ])* (float )(BB [bi + 4 ]);
271+ result10 += (float )(AA [ai + 0 ])* (float )(BB [bi + 5 ]);
272+ result11 += (float )(AA [ai + 1 ])* (float )(BB [bi + 5 ]);
273+ result12 += (float )(AA [ai + 0 ])* (float )(BB [bi + 6 ]);
274+ result13 += (float )(AA [ai + 1 ])* (float )(BB [bi + 6 ]);
275+ result14 += (float )(AA [ai + 0 ])* (float )(BB [bi + 7 ]);
276+ result15 += (float )(AA [ai + 1 ])* (float )(BB [bi + 7 ]);
275277 ai += 2 ;
276278 bi += 8 ;
277279 }
@@ -314,14 +316,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
314316 BLASLONG bi = n_top * K ;
315317
316318 for (BLASLONG k = 0 ; k < K ; k ++ ) {
317- result0 += (float )(A [ai + 0 ])* (float )(B [bi + 0 ]);
318- result1 += (float )(A [ai + 0 ])* (float )(B [bi + 1 ]);
319- result2 += (float )(A [ai + 0 ])* (float )(B [bi + 2 ]);
320- result3 += (float )(A [ai + 0 ])* (float )(B [bi + 3 ]);
321- result4 += (float )(A [ai + 0 ])* (float )(B [bi + 4 ]);
322- result5 += (float )(A [ai + 0 ])* (float )(B [bi + 5 ]);
323- result6 += (float )(A [ai + 0 ])* (float )(B [bi + 6 ]);
324- result7 += (float )(A [ai + 0 ])* (float )(B [bi + 7 ]);
319+ result0 += (float )(AA [ai + 0 ])* (float )(BB [bi + 0 ]);
320+ result1 += (float )(AA [ai + 0 ])* (float )(BB [bi + 1 ]);
321+ result2 += (float )(AA [ai + 0 ])* (float )(BB [bi + 2 ]);
322+ result3 += (float )(AA [ai + 0 ])* (float )(BB [bi + 3 ]);
323+ result4 += (float )(AA [ai + 0 ])* (float )(BB [bi + 4 ]);
324+ result5 += (float )(AA [ai + 0 ])* (float )(BB [bi + 5 ]);
325+ result6 += (float )(AA [ai + 0 ])* (float )(BB [bi + 6 ]);
326+ result7 += (float )(AA [ai + 0 ])* (float )(BB [bi + 7 ]);
325327 ai += 1 ;
326328 bi += 8 ;
327329 }
@@ -354,13 +356,13 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
354356 vfloat32m2_t result3 = __riscv_vfmv_v_f_f32m2 (0.0f , gvl );
355357
356358 for (BLASLONG k = 0 ; k < K ; k ++ ) {
357- __bf16 B0 = B [bi + 0 ];
358- __bf16 B1 = B [bi + 1 ];
359- __bf16 B2 = B [bi + 2 ];
360- __bf16 B3 = B [bi + 3 ];
359+ __bf16 B0 = BB [bi + 0 ];
360+ __bf16 B1 = BB [bi + 1 ];
361+ __bf16 B2 = BB [bi + 2 ];
362+ __bf16 B3 = BB [bi + 3 ];
361363 bi += 4 ;
362364
363- vbfloat16m1_t A0 = __riscv_vle16_v_bf16m1 ( & A [ai + 0 * gvl ], gvl );
365+ vbfloat16m1_t A0 = __riscv_vle16_v_bf16m1 ( & AA [ai + 0 * gvl ], gvl );
364366 ai += 16 ;
365367
366368 result0 = __riscv_vfwmaccbf16_vf_f32m2 (result0 , B0 , A0 , gvl );
@@ -401,13 +403,13 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
401403 vfloat32m1_t result3 = __riscv_vfmv_v_f_f32m1 (0.0f , gvl );
402404
403405 for (BLASLONG k = 0 ; k < K ; k ++ ) {
404- __bf16 B0 = B [bi + 0 ];
405- __bf16 B1 = B [bi + 1 ];
406- __bf16 B2 = B [bi + 2 ];
407- __bf16 B3 = B [bi + 3 ];
406+ __bf16 B0 = BB [bi + 0 ];
407+ __bf16 B1 = BB [bi + 1 ];
408+ __bf16 B2 = BB [bi + 2 ];
409+ __bf16 B3 = BB [bi + 3 ];
408410 bi += 4 ;
409411
410- vbfloat16mf2_t A0 = __riscv_vle16_v_bf16mf2 ( & A [ai + 0 * gvl ], gvl );
412+ vbfloat16mf2_t A0 = __riscv_vle16_v_bf16mf2 ( & AA [ai + 0 * gvl ], gvl );
411413 ai += 8 ;
412414
413415 result0 = __riscv_vfwmaccbf16_vf_f32m1 (result0 , B0 , A0 , gvl );
@@ -449,13 +451,13 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
449451 vfloat32m1_t result3 = __riscv_vfmv_v_f_f32m1 (0.0f , gvl );
450452
451453 for (BLASLONG k = 0 ; k < K ; ++ k ) {
452- __bf16 B0 = B [bi + 0 ];
453- __bf16 B1 = B [bi + 1 ];
454- __bf16 B2 = B [bi + 2 ];
455- __bf16 B3 = B [bi + 3 ];
454+ __bf16 B0 = BB [bi + 0 ];
455+ __bf16 B1 = BB [bi + 1 ];
456+ __bf16 B2 = BB [bi + 2 ];
457+ __bf16 B3 = BB [bi + 3 ];
456458 bi += 4 ;
457459
458- vbfloat16mf2_t A0 = __riscv_vle16_v_bf16mf2 ( & A [ai + 0 * gvl ], gvl );
460+ vbfloat16mf2_t A0 = __riscv_vle16_v_bf16mf2 ( & AA [ai + 0 * gvl ], gvl );
459461 ai += 4 ;
460462
461463 result0 = __riscv_vfwmaccbf16_vf_f32m1 (result0 , B0 , A0 , gvl );
@@ -501,14 +503,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
501503 BLASLONG bi = n_top * K ;
502504
503505 for (BLASLONG k = 0 ; k < K ; k ++ ) {
504- result0 += (float )(A [ai + 0 ])* (float )(B [bi + 0 ]);
505- result1 += (float )(A [ai + 1 ])* (float )(B [bi + 0 ]);
506- result2 += (float )(A [ai + 0 ])* (float )(B [bi + 1 ]);
507- result3 += (float )(A [ai + 1 ])* (float )(B [bi + 1 ]);
508- result4 += (float )(A [ai + 0 ])* (float )(B [bi + 2 ]);
509- result5 += (float )(A [ai + 1 ])* (float )(B [bi + 2 ]);
510- result6 += (float )(A [ai + 0 ])* (float )(B [bi + 3 ]);
511- result7 += (float )(A [ai + 1 ])* (float )(B [bi + 3 ]);
506+ result0 += (float )(AA [ai + 0 ])* (float )(BB [bi + 0 ]);
507+ result1 += (float )(AA [ai + 1 ])* (float )(BB [bi + 0 ]);
508+ result2 += (float )(AA [ai + 0 ])* (float )(BB [bi + 1 ]);
509+ result3 += (float )(AA [ai + 1 ])* (float )(BB [bi + 1 ]);
510+ result4 += (float )(AA [ai + 0 ])* (float )(BB [bi + 2 ]);
511+ result5 += (float )(AA [ai + 1 ])* (float )(BB [bi + 2 ]);
512+ result6 += (float )(AA [ai + 0 ])* (float )(BB [bi + 3 ]);
513+ result7 += (float )(AA [ai + 1 ])* (float )(BB [bi + 3 ]);
512514 ai += 2 ;
513515 bi += 4 ;
514516 }
@@ -537,10 +539,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
537539 BLASLONG bi = n_top * K ;
538540
539541 for (BLASLONG k = 0 ; k < K ; k ++ ) {
540- result0 += (float )(A [ai + 0 ])* (float )(B [bi + 0 ]);
541- result1 += (float )(A [ai + 0 ])* (float )(B [bi + 1 ]);
542- result2 += (float )(A [ai + 0 ])* (float )(B [bi + 2 ]);
543- result3 += (float )(A [ai + 0 ])* (float )(B [bi + 3 ]);
542+ result0 += (float )(AA [ai + 0 ])* (float )(BB [bi + 0 ]);
543+ result1 += (float )(AA [ai + 0 ])* (float )(BB [bi + 1 ]);
544+ result2 += (float )(AA [ai + 0 ])* (float )(BB [bi + 2 ]);
545+ result3 += (float )(AA [ai + 0 ])* (float )(BB [bi + 3 ]);
544546 ai += 1 ;
545547 bi += 4 ;
546548 }
@@ -569,11 +571,11 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
569571 vfloat32m2_t result1 = __riscv_vfmv_v_f_f32m2 (0.0f , gvl );
570572
571573 for (BLASLONG k = 0 ; k < K ; k ++ ) {
572- __bf16 B0 = B [bi + 0 ];
573- __bf16 B1 = B [bi + 1 ];
574+ __bf16 B0 = BB [bi + 0 ];
575+ __bf16 B1 = BB [bi + 1 ];
574576 bi += 2 ;
575577
576- vbfloat16m1_t A0 = __riscv_vle16_v_bf16m1 ( & A [ai + 0 * gvl ], gvl );
578+ vbfloat16m1_t A0 = __riscv_vle16_v_bf16m1 ( & AA [ai + 0 * gvl ], gvl );
577579 ai += 16 ;
578580
579581 result0 = __riscv_vfwmaccbf16_vf_f32m2 (result0 , B0 , A0 , gvl );
@@ -603,11 +605,11 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
603605 vfloat32m1_t result1 = __riscv_vfmv_v_f_f32m1 (0.0f , gvl );
604606
605607 for (BLASLONG k = 0 ; k < K ; k ++ ) {
606- __bf16 B0 = B [bi + 0 ];
607- __bf16 B1 = B [bi + 1 ];
608+ __bf16 B0 = BB [bi + 0 ];
609+ __bf16 B1 = BB [bi + 1 ];
608610 bi += 2 ;
609611
610- vbfloat16mf2_t A0 = __riscv_vle16_v_bf16mf2 ( & A [ai + 0 * gvl ], gvl );
612+ vbfloat16mf2_t A0 = __riscv_vle16_v_bf16mf2 ( & AA [ai + 0 * gvl ], gvl );
611613 ai += 8 ;
612614
613615 result0 = __riscv_vfwmaccbf16_vf_f32m1 (result0 , B0 , A0 , gvl );
@@ -639,11 +641,11 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
639641 vfloat32m1_t result1 = __riscv_vfmv_v_f_f32m1 (0.0f , gvl );
640642
641643 for (BLASLONG k = 0 ; k < K ; ++ k ) {
642- __bf16 B0 = B [bi + 0 ];
643- __bf16 B1 = B [bi + 1 ];
644+ __bf16 B0 = BB [bi + 0 ];
645+ __bf16 B1 = BB [bi + 1 ];
644646 bi += 2 ;
645647
646- vbfloat16mf2_t A0 = __riscv_vle16_v_bf16mf2 ( & A [ai + 0 * gvl ], gvl );
648+ vbfloat16mf2_t A0 = __riscv_vle16_v_bf16mf2 ( & AA [ai + 0 * gvl ], gvl );
647649 ai += 4 ;
648650
649651 result0 = __riscv_vfwmaccbf16_vf_f32m1 (result0 , B0 , A0 , gvl );
@@ -675,10 +677,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
675677 BLASLONG bi = n_top * K ;
676678
677679 for (BLASLONG k = 0 ; k < K ; k ++ ) {
678- result0 += (float )(A [ai + 0 ])* (float )(B [bi + 0 ]);
679- result1 += (float )(A [ai + 1 ])* (float )(B [bi + 0 ]);
680- result2 += (float )(A [ai + 0 ])* (float )(B [bi + 1 ]);
681- result3 += (float )(A [ai + 1 ])* (float )(B [bi + 1 ]);
680+ result0 += (float )(AA [ai + 0 ])* (float )(BB [bi + 0 ]);
681+ result1 += (float )(AA [ai + 1 ])* (float )(BB [bi + 0 ]);
682+ result2 += (float )(AA [ai + 0 ])* (float )(BB [bi + 1 ]);
683+ result3 += (float )(AA [ai + 1 ])* (float )(BB [bi + 1 ]);
682684 ai += 2 ;
683685 bi += 2 ;
684686 }
@@ -701,8 +703,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
701703 BLASLONG bi = n_top * K ;
702704
703705 for (BLASLONG k = 0 ; k < K ; k ++ ) {
704- result0 += (float )(A [ai + 0 ])* (float )(B [bi + 0 ]);
705- result1 += (float )(A [ai + 0 ])* (float )(B [bi + 1 ]);
706+ result0 += (float )(AA [ai + 0 ])* (float )(BB [bi + 0 ]);
707+ result1 += (float )(AA [ai + 0 ])* (float )(BB [bi + 1 ]);
706708 ai += 1 ;
707709 bi += 2 ;
708710 }
@@ -728,10 +730,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
728730 vfloat32m2_t result0 = __riscv_vfmv_v_f_f32m2 (0.0f , gvl );
729731
730732 for (BLASLONG k = 0 ; k < K ; k ++ ) {
731- __bf16 B0 = B [bi + 0 ];
733+ __bf16 B0 = BB [bi + 0 ];
732734 bi += 1 ;
733735
734- vbfloat16m1_t A0 = __riscv_vle16_v_bf16m1 ( & A [ai + 0 * gvl ], gvl );
736+ vbfloat16m1_t A0 = __riscv_vle16_v_bf16m1 ( & AA [ai + 0 * gvl ], gvl );
735737 ai += 16 ;
736738
737739 result0 = __riscv_vfwmaccbf16_vf_f32m2 (result0 , B0 , A0 , gvl );
@@ -757,10 +759,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
757759 vfloat32m1_t result0 = __riscv_vfmv_v_f_f32m1 (0.0f , gvl );
758760
759761 for (BLASLONG k = 0 ; k < K ; k ++ ) {
760- __bf16 B0 = B [bi + 0 ];
762+ __bf16 B0 = BB [bi + 0 ];
761763 bi += 1 ;
762764
763- vbfloat16mf2_t A0 = __riscv_vle16_v_bf16mf2 ( & A [ai + 0 * gvl ], gvl );
765+ vbfloat16mf2_t A0 = __riscv_vle16_v_bf16mf2 ( & AA [ai + 0 * gvl ], gvl );
764766 ai += 8 ;
765767
766768 result0 = __riscv_vfwmaccbf16_vf_f32m1 (result0 , B0 , A0 , gvl );
@@ -787,10 +789,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
787789 vfloat32m1_t result0 = __riscv_vfmv_v_f_f32m1 (0.0f , gvl );
788790
789791 for (BLASLONG k = 0 ; k < K ; ++ k ) {
790- __bf16 B0 = B [bi + 0 ];
792+ __bf16 B0 = BB [bi + 0 ];
791793 bi += 1 ;
792794
793- vbfloat16mf2_t A0 = __riscv_vle16_v_bf16mf2 ( & A [ai + 0 * gvl ], gvl );
795+ vbfloat16mf2_t A0 = __riscv_vle16_v_bf16mf2 ( & AA [ai + 0 * gvl ], gvl );
794796 ai += 4 ;
795797
796798 result0 = __riscv_vfwmaccbf16_vf_f32m1 (result0 , B0 , A0 , gvl );
@@ -814,8 +816,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
814816 BLASLONG bi = n_top * K ;
815817
816818 for (BLASLONG k = 0 ; k < K ; k ++ ) {
817- result0 += (float )(A [ai + 0 ])* (float )(B [bi + 0 ]);
818- result1 += (float )(A [ai + 1 ])* (float )(B [bi + 0 ]);
819+ result0 += (float )(AA [ai + 0 ])* (float )(BB [bi + 0 ]);
820+ result1 += (float )(AA [ai + 1 ])* (float )(BB [bi + 0 ]);
819821 ai += 2 ;
820822 bi += 1 ;
821823 }
@@ -835,7 +837,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
835837 BLASLONG bi = n_top * K ;
836838
837839 for (BLASLONG k = 0 ; k < K ; k ++ ) {
838- result0 += (float )(A [ai + 0 ])* (float )(B [bi + 0 ]);
840+ result0 += (float )(AA [ai + 0 ])* (float )(BB [bi + 0 ]);
839841 ai += 1 ;
840842 bi += 1 ;
841843 }
0 commit comments