@@ -2885,7 +2885,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
2885
2885
ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_p021_f16_f32[i], "mul_mat_vec_p021_f16_f32"+std::to_string(i+1), mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {device->subgroup_size, i + 1}, 1, true);
2886
2886
}
2887
2887
}
2888
- ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 9 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
2888
+ ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 12 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
2889
2889
2890
2890
ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
2891
2891
ggml_vk_create_pipeline(device, device->pipeline_group_norm_f32, "group_norm_f32", group_norm_f32_len, group_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
@@ -5821,7 +5821,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
5821
5821
const uint64_t ne00 = src0->ne[0];
5822
5822
const uint64_t ne01 = src0->ne[1];
5823
5823
const uint64_t ne02 = src0->ne[2];
5824
- // const uint64_t ne03 = src0->ne[3];
5824
+ const uint64_t ne03 = src0->ne[3];
5825
5825
5826
5826
const uint64_t nb01 = src0->nb[1];
5827
5827
const uint64_t nb02 = src0->nb[2];
@@ -5833,7 +5833,12 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
5833
5833
const uint64_t ne12 = src1->ne[2];
5834
5834
// const uint64_t ne13 = src1->ne[3];
5835
5835
5836
+ const uint32_t nb03 = (uint32_t)(src0->nb[3] / sizeof(ggml_fp16_t));
5837
+ const uint32_t nb13 = (uint32_t)(src1->nb[3] / sizeof(float));
5838
+ const uint32_t nb23 = (uint32_t)(dst->nb[3] / sizeof(float));
5839
+
5836
5840
GGML_ASSERT(ne11 == 1);
5841
+ GGML_ASSERT(src0->ne[3] == src1->ne[3]); // checked in supports_op
5837
5842
5838
5843
ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
5839
5844
ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
@@ -5849,7 +5854,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
5849
5854
src1_uma = d_Qy != nullptr;
5850
5855
}
5851
5856
5852
- const uint64_t d_ne = ne01 * ne11 * ne12;
5857
+ const uint64_t d_ne = ne01 * ne11 * ne12 * ne03 ;
5853
5858
5854
5859
const uint32_t row_stride_x = nb01 / sizeof(ggml_fp16_t);
5855
5860
const uint32_t channel_stride_x = nb02 / sizeof(ggml_fp16_t);
@@ -5884,10 +5889,10 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
5884
5889
const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;
5885
5890
5886
5891
// compute
5887
- const std::array<uint32_t, 9 > pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, channel_stride_y, (uint32_t)(ne12 / ne02), (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
5892
+ const std::array<uint32_t, 12 > pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, channel_stride_y, (uint32_t)(ne12 / ne02), (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)), nb03, nb13, nb23 };
5888
5893
ggml_vk_sync_buffers(subctx);
5889
5894
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32,
5890
- { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { 1 , (uint32_t)ne01, (uint32_t)ne12 });
5895
+ { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { (uint32_t)ne03 , (uint32_t)ne01, (uint32_t)ne12 });
5891
5896
}
5892
5897
5893
5898
static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
0 commit comments