@@ -400,10 +400,10 @@ struct ggml_backend_opencl_context {
400
400
cl_program program_mul_mm_f32_f32_l4_lm;
401
401
cl_program program_mul_mm_f16_f32_l4_lm;
402
402
403
- cl_kernel kernel_add, kernel_add_row;
404
- cl_kernel kernel_mul, kernel_mul_row;
405
- cl_kernel kernel_div, kernel_div_row;
406
- cl_kernel kernel_sub, kernel_sub_row;
403
+ cl_kernel kernel_add, kernel_add_row, kernel_add_f16, kernel_add_row_f16 ;
404
+ cl_kernel kernel_mul, kernel_mul_row, kernel_mul_f16, kernel_mul_row_f16 ;
405
+ cl_kernel kernel_div, kernel_div_row, kernel_div_f16, kernel_div_row_f16 ;
406
+ cl_kernel kernel_sub, kernel_sub_row, kernel_sub_f16, kernel_sub_row_f16 ;
407
407
cl_kernel kernel_scale;
408
408
cl_kernel kernel_silu, kernel_silu_4;
409
409
cl_kernel kernel_gelu, kernel_gelu_4;
@@ -674,8 +674,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
674
674
backend_ctx->program_add =
675
675
build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
676
676
677
- CL_CHECK ((backend_ctx->kernel_add = clCreateKernel (backend_ctx->program_add , " kernel_add" , &err), err));
678
- CL_CHECK ((backend_ctx->kernel_add_row = clCreateKernel (backend_ctx->program_add , " kernel_add_row" , &err), err));
677
+ CL_CHECK ((backend_ctx->kernel_add = clCreateKernel (backend_ctx->program_add , " kernel_add" , &err), err));
678
+ CL_CHECK ((backend_ctx->kernel_add_row = clCreateKernel (backend_ctx->program_add , " kernel_add_row" , &err), err));
679
+ CL_CHECK ((backend_ctx->kernel_add_f16 = clCreateKernel (backend_ctx->program_add , " kernel_add_f16" , &err), err));
680
+ CL_CHECK ((backend_ctx->kernel_add_row_f16 = clCreateKernel (backend_ctx->program_add , " kernel_add_row_f16" , &err), err));
679
681
GGML_LOG_CONT (" ." );
680
682
}
681
683
@@ -1089,8 +1091,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
1089
1091
backend_ctx->program_mul =
1090
1092
build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
1091
1093
1092
- CL_CHECK ((backend_ctx->kernel_mul = clCreateKernel (backend_ctx->program_mul , " kernel_mul" , &err), err));
1093
- CL_CHECK ((backend_ctx->kernel_mul_row = clCreateKernel (backend_ctx->program_mul , " kernel_mul_row" , &err), err));
1094
+ CL_CHECK ((backend_ctx->kernel_mul = clCreateKernel (backend_ctx->program_mul , " kernel_mul" , &err), err));
1095
+ CL_CHECK ((backend_ctx->kernel_mul_row = clCreateKernel (backend_ctx->program_mul , " kernel_mul_row" , &err), err));
1096
+ CL_CHECK ((backend_ctx->kernel_mul_f16 = clCreateKernel (backend_ctx->program_mul , " kernel_mul_f16" , &err), err));
1097
+ CL_CHECK ((backend_ctx->kernel_mul_row_f16 = clCreateKernel (backend_ctx->program_mul , " kernel_mul_row_f16" , &err), err));
1094
1098
GGML_LOG_CONT (" ." );
1095
1099
}
1096
1100
@@ -1288,11 +1292,16 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
1288
1292
#else
1289
1293
const std::string kernel_src = read_file (" div.cl" );
1290
1294
#endif
1295
+ std::string compile_opts = std::string (" -cl-std=" ) + opencl_c_std +
1296
+ " -cl-mad-enable -cl-finite-math-only " ;
1297
+
1291
1298
backend_ctx->program_div =
1292
1299
build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
1293
1300
1294
- CL_CHECK ((backend_ctx->kernel_div = clCreateKernel (backend_ctx->program_div , " kernel_div" , &err), err));
1295
- CL_CHECK ((backend_ctx->kernel_div_row = clCreateKernel (backend_ctx->program_div , " kernel_div_row" , &err), err));
1301
+ CL_CHECK ((backend_ctx->kernel_div = clCreateKernel (backend_ctx->program_div , " kernel_div" , &err), err));
1302
+ CL_CHECK ((backend_ctx->kernel_div_row = clCreateKernel (backend_ctx->program_div , " kernel_div_row" , &err), err));
1303
+ CL_CHECK ((backend_ctx->kernel_div_f16 = clCreateKernel (backend_ctx->program_div , " kernel_div_f16" , &err), err));
1304
+ CL_CHECK ((backend_ctx->kernel_div_row_f16 = clCreateKernel (backend_ctx->program_div , " kernel_div_row_f16" , &err), err));
1296
1305
GGML_LOG_CONT (" ." );
1297
1306
}
1298
1307
@@ -1308,8 +1317,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
1308
1317
backend_ctx->program_sub =
1309
1318
build_program_from_source (backend_ctx->context , backend_ctx->device , kernel_src.c_str (), compile_opts);
1310
1319
1311
- CL_CHECK ((backend_ctx->kernel_sub = clCreateKernel (backend_ctx->program_sub , " kernel_sub" , &err), err));
1312
- CL_CHECK ((backend_ctx->kernel_sub_row = clCreateKernel (backend_ctx->program_sub , " kernel_sub_row" , &err), err));
1320
+ CL_CHECK ((backend_ctx->kernel_sub = clCreateKernel (backend_ctx->program_sub , " kernel_sub" , &err), err));
1321
+ CL_CHECK ((backend_ctx->kernel_sub_row = clCreateKernel (backend_ctx->program_sub , " kernel_sub_row" , &err), err));
1322
+ CL_CHECK ((backend_ctx->kernel_sub_f16 = clCreateKernel (backend_ctx->program_sub , " kernel_sub_f16" , &err), err));
1323
+ CL_CHECK ((backend_ctx->kernel_sub_row_f16 = clCreateKernel (backend_ctx->program_sub , " kernel_sub_row_f16" , &err), err));
1313
1324
GGML_LOG_CONT (" ." );
1314
1325
}
1315
1326
@@ -2447,12 +2458,15 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
2447
2458
default :
2448
2459
return false ;
2449
2460
}
2450
- case GGML_OP_ADD:
2451
2461
case GGML_OP_SCALE:
2462
+ return op->src [0 ]->type == GGML_TYPE_F32 && ggml_is_contiguous (op->src [0 ]);
2463
+ case GGML_OP_ADD:
2452
2464
case GGML_OP_MUL:
2453
2465
case GGML_OP_DIV:
2454
2466
case GGML_OP_SUB:
2455
- return op->src [0 ]->type == GGML_TYPE_F32;
2467
+ return (op->src [0 ]->type == op->src [1 ]->type ) &&
2468
+ (op->src [0 ]->type == op->type ) &&
2469
+ (op->src [0 ]->type == GGML_TYPE_F32 || op->src [0 ]->type == GGML_TYPE_F16);
2456
2470
case GGML_OP_UNARY:
2457
2471
switch (ggml_get_unary_op (op)) {
2458
2472
case GGML_UNARY_OP_GELU:
@@ -3680,35 +3694,39 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
3680
3694
GGML_ASSERT (dst);
3681
3695
GGML_ASSERT (dst->extra );
3682
3696
3683
- const int ne00 = src0 ? src0->ne [0 ] : 0 ;
3684
- const int ne01 = src0 ? src0->ne [1 ] : 0 ;
3685
- const int ne02 = src0 ? src0->ne [2 ] : 0 ;
3686
- const int ne03 = src0 ? src0->ne [3 ] : 0 ;
3697
+ GGML_ASSERT (src0->type == src1->type );
3698
+ GGML_ASSERT (src0->type == dst->type );
3699
+ GGML_ASSERT (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
3687
3700
3688
- const cl_ulong nb00 = src0 ? src0-> nb [0 ] : 0 ;
3689
- const cl_ulong nb01 = src0 ? src0-> nb [1 ] : 0 ;
3690
- const cl_ulong nb02 = src0 ? src0-> nb [2 ] : 0 ;
3691
- const cl_ulong nb03 = src0 ? src0-> nb [3 ] : 0 ;
3701
+ const int ne00 = src0-> ne [0 ];
3702
+ const int ne01 = src0-> ne [1 ];
3703
+ const int ne02 = src0-> ne [2 ];
3704
+ const int ne03 = src0-> ne [3 ];
3692
3705
3693
- const int ne10 = src1 ? src1-> ne [0 ] : 0 ;
3694
- const int ne11 = src1 ? src1-> ne [1 ] : 0 ;
3695
- const int ne12 = src1 ? src1-> ne [2 ] : 0 ;
3696
- const int ne13 = src1 ? src1-> ne [3 ] : 0 ; UNUSED (ne13) ;
3706
+ const cl_ulong nb00 = src0-> nb [0 ];
3707
+ const cl_ulong nb01 = src0-> nb [1 ];
3708
+ const cl_ulong nb02 = src0-> nb [2 ];
3709
+ const cl_ulong nb03 = src0-> nb [3 ];
3697
3710
3698
- const cl_ulong nb10 = src1 ? src1-> nb [0 ] : 0 ;
3699
- const cl_ulong nb11 = src1 ? src1-> nb [1 ] : 0 ;
3700
- const cl_ulong nb12 = src1 ? src1-> nb [2 ] : 0 ;
3701
- const cl_ulong nb13 = src1 ? src1-> nb [3 ] : 0 ; UNUSED (nb13 );
3711
+ const int ne10 = src1-> ne [0 ];
3712
+ const int ne11 = src1-> ne [1 ];
3713
+ const int ne12 = src1-> ne [2 ];
3714
+ const int ne13 = src1-> ne [3 ]; UNUSED (ne13 );
3702
3715
3703
- const int ne0 = dst ? dst-> ne [0 ] : 0 ;
3704
- const int ne1 = dst ? dst-> ne [1 ] : 0 ;
3705
- const int ne2 = dst ? dst-> ne [2 ] : 0 ;
3706
- const int ne3 = dst ? dst-> ne [3 ] : 0 ;
3716
+ const cl_ulong nb10 = src1-> nb [0 ];
3717
+ const cl_ulong nb11 = src1-> nb [1 ];
3718
+ const cl_ulong nb12 = src1-> nb [2 ];
3719
+ const cl_ulong nb13 = src1-> nb [3 ]; UNUSED (nb13) ;
3707
3720
3708
- const cl_ulong nb0 = dst ? dst->nb [0 ] : 0 ;
3709
- const cl_ulong nb1 = dst ? dst->nb [1 ] : 0 ;
3710
- const cl_ulong nb2 = dst ? dst->nb [2 ] : 0 ;
3711
- const cl_ulong nb3 = dst ? dst->nb [3 ] : 0 ;
3721
+ const int ne0 = dst->ne [0 ];
3722
+ const int ne1 = dst->ne [1 ];
3723
+ const int ne2 = dst->ne [2 ];
3724
+ const int ne3 = dst->ne [3 ];
3725
+
3726
+ const cl_ulong nb0 = dst->nb [0 ];
3727
+ const cl_ulong nb1 = dst->nb [1 ];
3728
+ const cl_ulong nb2 = dst->nb [2 ];
3729
+ const cl_ulong nb3 = dst->nb [3 ];
3712
3730
3713
3731
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context ;
3714
3732
@@ -3731,7 +3749,12 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
3731
3749
3732
3750
bcast_row = true ;
3733
3751
int ne = ne00 / 4 ;
3734
- kernel = backend_ctx->kernel_add_row ;
3752
+
3753
+ if (src0->type == GGML_TYPE_F32) {
3754
+ kernel = backend_ctx->kernel_add_row ;
3755
+ } else {
3756
+ kernel = backend_ctx->kernel_add_row_f16 ;
3757
+ }
3735
3758
3736
3759
CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &extra0->data_device ));
3737
3760
CL_CHECK (clSetKernelArg (kernel, 1 , sizeof (cl_ulong), &offset0));
@@ -3741,7 +3764,11 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
3741
3764
CL_CHECK (clSetKernelArg (kernel, 5 , sizeof (cl_ulong), &offsetd));
3742
3765
CL_CHECK (clSetKernelArg (kernel, 6 , sizeof (int ), &ne));
3743
3766
} else {
3744
- kernel = backend_ctx->kernel_add ;
3767
+ if (src0->type == GGML_TYPE_F32) {
3768
+ kernel = backend_ctx->kernel_add ;
3769
+ } else {
3770
+ kernel = backend_ctx->kernel_add_f16 ;
3771
+ }
3745
3772
3746
3773
CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &extra0->data_device ));
3747
3774
CL_CHECK (clSetKernelArg (kernel, 1 , sizeof (cl_ulong), &offset0));
@@ -3803,35 +3830,39 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
3803
3830
GGML_ASSERT (dst);
3804
3831
GGML_ASSERT (dst->extra );
3805
3832
3806
- const int ne00 = src0 ? src0->ne [0 ] : 0 ;
3807
- const int ne01 = src0 ? src0->ne [1 ] : 0 ;
3808
- const int ne02 = src0 ? src0->ne [2 ] : 0 ;
3809
- const int ne03 = src0 ? src0->ne [3 ] : 0 ;
3833
+ GGML_ASSERT (src0->type == src1->type );
3834
+ GGML_ASSERT (src0->type == dst->type );
3835
+ GGML_ASSERT (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
3810
3836
3811
- const cl_ulong nb00 = src0 ? src0-> nb [0 ] : 0 ;
3812
- const cl_ulong nb01 = src0 ? src0-> nb [1 ] : 0 ;
3813
- const cl_ulong nb02 = src0 ? src0-> nb [2 ] : 0 ;
3814
- const cl_ulong nb03 = src0 ? src0-> nb [3 ] : 0 ;
3837
+ const int ne00 = src0-> ne [0 ];
3838
+ const int ne01 = src0-> ne [1 ];
3839
+ const int ne02 = src0-> ne [2 ];
3840
+ const int ne03 = src0-> ne [3 ];
3815
3841
3816
- const int ne10 = src1 ? src1-> ne [0 ] : 0 ;
3817
- const int ne11 = src1 ? src1-> ne [1 ] : 0 ;
3818
- const int ne12 = src1 ? src1-> ne [2 ] : 0 ;
3819
- const int ne13 = src1 ? src1-> ne [3 ] : 0 ; UNUSED (ne13) ;
3842
+ const cl_ulong nb00 = src0-> nb [0 ];
3843
+ const cl_ulong nb01 = src0-> nb [1 ];
3844
+ const cl_ulong nb02 = src0-> nb [2 ];
3845
+ const cl_ulong nb03 = src0-> nb [3 ];
3820
3846
3821
- const cl_ulong nb10 = src1 ? src1->nb [0 ] : 0 ;
3822
- const cl_ulong nb11 = src1 ? src1->nb [1 ] : 0 ;
3823
- const cl_ulong nb12 = src1 ? src1->nb [2 ] : 0 ;
3824
- const cl_ulong nb13 = src1 ? src1->nb [3 ] : 0 ; UNUSED (nb13);
3847
+ const int ne10 = src1->ne [0 ];
3848
+ const int ne11 = src1->ne [1 ];
3849
+ const int ne12 = src1->ne [2 ];
3850
+ const int ne13 = src1->ne [3 ]; UNUSED (ne13);
3851
+
3852
+ const cl_ulong nb10 = src1->nb [0 ];
3853
+ const cl_ulong nb11 = src1->nb [1 ];
3854
+ const cl_ulong nb12 = src1->nb [2 ];
3855
+ const cl_ulong nb13 = src1->nb [3 ]; UNUSED (nb13);
3825
3856
3826
- const int ne0 = dst ? dst ->ne [0 ] : 0 ;
3827
- const int ne1 = dst ? dst ->ne [1 ] : 0 ;
3828
- const int ne2 = dst ? dst ->ne [2 ] : 0 ;
3829
- const int ne3 = dst ? dst ->ne [3 ] : 0 ;
3857
+ const int ne0 = dst->ne [0 ];
3858
+ const int ne1 = dst->ne [1 ];
3859
+ const int ne2 = dst->ne [2 ];
3860
+ const int ne3 = dst->ne [3 ];
3830
3861
3831
- const cl_ulong nb0 = dst ? dst ->nb [0 ] : 0 ;
3832
- const cl_ulong nb1 = dst ? dst ->nb [1 ] : 0 ;
3833
- const cl_ulong nb2 = dst ? dst ->nb [2 ] : 0 ;
3834
- const cl_ulong nb3 = dst ? dst ->nb [3 ] : 0 ;
3862
+ const cl_ulong nb0 = dst->nb [0 ];
3863
+ const cl_ulong nb1 = dst->nb [1 ];
3864
+ const cl_ulong nb2 = dst->nb [2 ];
3865
+ const cl_ulong nb3 = dst->nb [3 ];
3835
3866
3836
3867
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context ;
3837
3868
@@ -3854,7 +3885,12 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
3854
3885
3855
3886
bcast_row = true ;
3856
3887
int ne = ne00 / 4 ;
3857
- kernel = backend_ctx->kernel_mul_row ;
3888
+
3889
+ if (src0->type == GGML_TYPE_F32) {
3890
+ kernel = backend_ctx->kernel_mul_row ;
3891
+ } else {
3892
+ kernel = backend_ctx->kernel_mul_row_f16 ;
3893
+ }
3858
3894
3859
3895
CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &extra0->data_device ));
3860
3896
CL_CHECK (clSetKernelArg (kernel, 1 , sizeof (cl_ulong), &offset0));
@@ -3864,7 +3900,11 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
3864
3900
CL_CHECK (clSetKernelArg (kernel, 5 , sizeof (cl_ulong), &offsetd));
3865
3901
CL_CHECK (clSetKernelArg (kernel, 6 , sizeof (int ), &ne));
3866
3902
} else {
3867
- kernel = backend_ctx->kernel_mul ;
3903
+ if (src0->type == GGML_TYPE_F32) {
3904
+ kernel = backend_ctx->kernel_mul ;
3905
+ } else {
3906
+ kernel = backend_ctx->kernel_mul_f16 ;
3907
+ }
3868
3908
3869
3909
CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &extra0->data_device ));
3870
3910
CL_CHECK (clSetKernelArg (kernel, 1 , sizeof (cl_ulong), &offset0));
@@ -3926,6 +3966,10 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
3926
3966
GGML_ASSERT (dst);
3927
3967
GGML_ASSERT (dst->extra );
3928
3968
3969
+ GGML_ASSERT (src0->type == src1->type );
3970
+ GGML_ASSERT (src0->type == dst->type );
3971
+ GGML_ASSERT (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
3972
+
3929
3973
const int ne00 = src0->ne [0 ];
3930
3974
const int ne01 = src0->ne [1 ];
3931
3975
const int ne02 = src0->ne [2 ];
@@ -3974,7 +4018,12 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
3974
4018
3975
4019
bcast_row = true ;
3976
4020
int ne = ne00 / 4 ;
3977
- kernel = backend_ctx->kernel_div_row ;
4021
+
4022
+ if (src0->type == GGML_TYPE_F32) {
4023
+ kernel = backend_ctx->kernel_div_row ;
4024
+ } else {
4025
+ kernel = backend_ctx->kernel_div_row_f16 ;
4026
+ }
3978
4027
3979
4028
CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &extra0->data_device ));
3980
4029
CL_CHECK (clSetKernelArg (kernel, 1 , sizeof (cl_ulong), &offset0));
@@ -3984,7 +4033,11 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
3984
4033
CL_CHECK (clSetKernelArg (kernel, 5 , sizeof (cl_ulong), &offsetd));
3985
4034
CL_CHECK (clSetKernelArg (kernel, 6 , sizeof (int ), &ne));
3986
4035
} else {
3987
- kernel = backend_ctx->kernel_div ;
4036
+ if (src0->type == GGML_TYPE_F32) {
4037
+ kernel = backend_ctx->kernel_div ;
4038
+ } else {
4039
+ kernel = backend_ctx->kernel_div_f16 ;
4040
+ }
3988
4041
3989
4042
CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &extra0->data_device ));
3990
4043
CL_CHECK (clSetKernelArg (kernel, 1 , sizeof (cl_ulong), &offset0));
@@ -4034,6 +4087,10 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
4034
4087
GGML_ASSERT (dst);
4035
4088
GGML_ASSERT (dst->extra );
4036
4089
4090
+ GGML_ASSERT (src0->type == src1->type );
4091
+ GGML_ASSERT (src0->type == dst->type );
4092
+ GGML_ASSERT (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
4093
+
4037
4094
const int ne00 = src0->ne [0 ];
4038
4095
const int ne01 = src0->ne [1 ];
4039
4096
const int ne02 = src0->ne [2 ];
@@ -4082,7 +4139,12 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
4082
4139
4083
4140
bcast_row = true ;
4084
4141
int ne = ne00 / 4 ;
4085
- kernel = backend_ctx->kernel_sub_row ;
4142
+
4143
+ if (src0->type == GGML_TYPE_F32) {
4144
+ kernel = backend_ctx->kernel_sub_row ;
4145
+ } else {
4146
+ kernel = backend_ctx->kernel_sub_row_f16 ;
4147
+ }
4086
4148
4087
4149
CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &extra0->data_device ));
4088
4150
CL_CHECK (clSetKernelArg (kernel, 1 , sizeof (cl_ulong), &offset0));
@@ -4092,7 +4154,11 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
4092
4154
CL_CHECK (clSetKernelArg (kernel, 5 , sizeof (cl_ulong), &offsetd));
4093
4155
CL_CHECK (clSetKernelArg (kernel, 6 , sizeof (int ), &ne));
4094
4156
} else {
4095
- kernel = backend_ctx->kernel_sub ;
4157
+ if (src0->type == GGML_TYPE_F32) {
4158
+ kernel = backend_ctx->kernel_sub ;
4159
+ } else {
4160
+ kernel = backend_ctx->kernel_sub_f16 ;
4161
+ }
4096
4162
4097
4163
CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &extra0->data_device ));
4098
4164
CL_CHECK (clSetKernelArg (kernel, 1 , sizeof (cl_ulong), &offset0));
0 commit comments