3 changes: 3 additions & 0 deletions ggml/src/ggml-opencl/CMakeLists.txt
@@ -105,6 +105,9 @@ set(GGML_OPENCL_KERNELS
pad
repeat
mul_mat_f16_f32
mul_mat_f16_f32_image
pack_a_for_image
pack_b_for_image
)

foreach (K ${GGML_OPENCL_KERNELS})
178 changes: 178 additions & 0 deletions ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -331,6 +331,8 @@ struct ggml_backend_opencl_context {

cl_int alignment;
size_t max_alloc_size;
size_t max_image_width;
size_t max_image_height;
bool fp16_support;
bool has_vector_subgroup_broadcast;
ggml_cl_compiler_version adreno_cl_compiler_version;
@@ -369,6 +371,10 @@ struct ggml_backend_opencl_context {
cl_program program_mul_mv_f32_f32;
cl_program program_mul;
cl_program program_mul_mat_f16_f32_tiled;
cl_program program_mul_mat_f16_f32_image;
cl_program program_pack_a_for_image;
cl_program program_pack_b_for_image;
cl_ulong global_mem_size;
cl_program program_div;
cl_program program_sub;
cl_program program_norm;
@@ -424,6 +430,9 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_mul_mat_f16_f32;
cl_kernel kernel_mul_mat_f16_f32_l4;
cl_kernel kernel_mul_mat_f16_f32_tiled;
cl_kernel kernel_mul_mat_f16_f32_image;
cl_kernel kernel_pack_a_for_image;
cl_kernel kernel_pack_b_for_image;
cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
@@ -1033,6 +1042,54 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
GGML_LOG_CONT(".");
}

// mul_mat_f16_f32_image
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src{
#include "mul_mat_f16_f32_image.cl.h"
};
#else
const std::string kernel_src = read_file("mul_mat_f16_f32_image.cl");
#endif
backend_ctx->program_mul_mat_f16_f32_image =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_image = clCreateKernel(backend_ctx->program_mul_mat_f16_f32_image, "mul_mat_f16_f32_image", &err), err));
GGML_LOG_CONT(".");
}

// pack_a_for_image
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src{
#include "pack_a_for_image.cl.h"
};
#else
const std::string kernel_src = read_file("pack_a_for_image.cl");
#endif
backend_ctx->program_pack_a_for_image =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

CL_CHECK((backend_ctx->kernel_pack_a_for_image = clCreateKernel(backend_ctx->program_pack_a_for_image, "pack_a_for_image", &err), err));
GGML_LOG_CONT(".");
}

// pack_b_for_image
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src{
#include "pack_b_for_image.cl.h"
};
#else
const std::string kernel_src = read_file("pack_b_for_image.cl");
#endif
backend_ctx->program_pack_b_for_image =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

CL_CHECK((backend_ctx->kernel_pack_b_for_image = clCreateKernel(backend_ctx->program_pack_b_for_image, "pack_b_for_image", &err), err));
GGML_LOG_CONT(".");
}

// mul
{
#ifdef GGML_OPENCL_EMBED_KERNELS
@@ -1987,6 +2044,11 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL);
GGML_LOG_INFO("ggml_opencl: max mem alloc size: %zu MB\n", backend_ctx->max_alloc_size/1024/1024);

CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &backend_ctx->global_mem_size, NULL));

CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &backend_ctx->max_image_width, NULL));
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &backend_ctx->max_image_height, NULL));
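    // These limits gate the image-based F16 x F32 matmul path: the packed A and B images
    // must fit within CL_DEVICE_IMAGE2D_MAX_WIDTH/HEIGHT (checked in ggml_cl_mul_mat).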

// Check SVM.
cl_device_svm_capabilities svm_caps;
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_SVM_CAPABILITIES, sizeof(cl_device_svm_capabilities), &svm_caps, 0));
@@ -4997,6 +5059,93 @@ static void ggml_cl_mul_mat_f16_f32_tiled(ggml_backend_t backend, const ggml_ten
backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst);
}

static void ggml_cl_mul_mat_f16_f32_image(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
cl_context context = backend_ctx->context;
cl_command_queue queue = backend_ctx->queue;
cl_int err = 0;

const int M = src0->ne[1];
const int N = src1->ne[1];
const int K = src0->ne[0];
const int K_4 = (K + 3) / 4;
const int N_4 = (N + 3) / 4;

ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offset1 = extra1->offset + src1->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;

cl_mem a_image = NULL, b_image = NULL;
cl_event pack_events[2];
cl_event matmul_event;

// Create image for A
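// A is packed as RGBA FP16: texel (k/4, m) holds A[m][k..k+3], so the image is K/4 texels wide and M tall.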
cl_image_format format_A = {CL_RGBA, CL_HALF_FLOAT};
cl_image_desc desc_A = {};
desc_A.image_type = CL_MEM_OBJECT_IMAGE2D;
desc_A.image_width = K_4;
desc_A.image_height = M;
a_image = clCreateImage(context, CL_MEM_READ_WRITE, &format_A, &desc_A, NULL, &err);
CL_CHECK(err);

// Create image for B
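// B is packed as RGBA FP16: texel (n/4, k) holds B[k][n..n+3], so the image is N/4 texels wide and K tall.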
cl_image_format format_B = {CL_RGBA, CL_HALF_FLOAT};
cl_image_desc desc_B = {};
desc_B.image_type = CL_MEM_OBJECT_IMAGE2D;
desc_B.image_width = N_4;
desc_B.image_height = K;
b_image = clCreateImage(context, CL_MEM_READ_WRITE, &format_B, &desc_B, NULL, &err);
CL_CHECK(err);

// Launch packing kernel for A
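// One work-item per texel (k/4, m); the kernel zero-pads the trailing K remainder.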
cl_kernel pack_a_kernel = backend_ctx->kernel_pack_a_for_image;
CL_CHECK(clSetKernelArg(pack_a_kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(pack_a_kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(pack_a_kernel, 2, sizeof(cl_mem), &a_image));
CL_CHECK(clSetKernelArg(pack_a_kernel, 3, sizeof(int), &M));
CL_CHECK(clSetKernelArg(pack_a_kernel, 4, sizeof(int), &K));
const size_t pack_a_gws[2] = { (size_t)K_4, (size_t)M };
CL_CHECK(clEnqueueNDRangeKernel(queue, pack_a_kernel, 2, NULL, pack_a_gws, NULL, 0, NULL, &pack_events[0]));

// Launch packing kernel for B
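// One work-item per texel (n/4, k); the kernel converts F32 to FP16 and zero-pads the trailing N remainder.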
cl_kernel pack_b_kernel = backend_ctx->kernel_pack_b_for_image;
CL_CHECK(clSetKernelArg(pack_b_kernel, 0, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(pack_b_kernel, 1, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(pack_b_kernel, 2, sizeof(cl_mem), &b_image));
CL_CHECK(clSetKernelArg(pack_b_kernel, 3, sizeof(int), &K));
CL_CHECK(clSetKernelArg(pack_b_kernel, 4, sizeof(int), &N));
const size_t pack_b_gws[2] = { (size_t)N_4, (size_t)K };
CL_CHECK(clEnqueueNDRangeKernel(queue, pack_b_kernel, 2, NULL, pack_b_gws, NULL, 0, NULL, &pack_events[1]));

// Launch matmul kernel
cl_kernel matmul_kernel = backend_ctx->kernel_mul_mat_f16_f32_image;
CL_CHECK(clSetKernelArg(matmul_kernel, 0, sizeof(cl_mem), &a_image));
CL_CHECK(clSetKernelArg(matmul_kernel, 1, sizeof(cl_mem), &b_image));
CL_CHECK(clSetKernelArg(matmul_kernel, 2, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(matmul_kernel, 3, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(matmul_kernel, 4, sizeof(int), &M));
CL_CHECK(clSetKernelArg(matmul_kernel, 5, sizeof(int), &N));
CL_CHECK(clSetKernelArg(matmul_kernel, 6, sizeof(int), &K));

// One work-item per 4-wide strip of C: dim 0 covers the N/4 packed columns, dim 1 covers the M rows,
// each rounded up to a multiple of the work-group size; the kernel bounds-checks the remainder.
const size_t lws[2] = { 16, 8 };
const size_t gws[2] = { ((size_t)N_4 + lws[0] - 1) / lws[0] * lws[0], ((size_t)M + lws[1] - 1) / lws[1] * lws[1] };
CL_CHECK(clEnqueueNDRangeKernel(queue, matmul_kernel, 2, NULL, gws, lws, 2, pack_events, &matmul_event));

// Wait for matmul to finish and release resources
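// The temporary images are created and released on every call; the blocking wait keeps them alive until the matmul completes.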
CL_CHECK(clWaitForEvents(1, &matmul_event));
CL_CHECK(clReleaseEvent(pack_events[0]));
CL_CHECK(clReleaseEvent(pack_events[1]));
CL_CHECK(clReleaseEvent(matmul_event));
CL_CHECK(clReleaseMemObject(a_image));
CL_CHECK(clReleaseMemObject(b_image));
}

static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0);
GGML_ASSERT(src0->extra);
@@ -5010,6 +5159,35 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co

ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32 &&
backend_ctx->gpu_family == ADRENO && backend_ctx->kernel_mul_mat_f16_f32_image != NULL &&
ggml_is_contiguous(src0) && ggml_is_contiguous(src1) &&
src0->ne[2] == 1 && src0->ne[3] == 1 &&
src1->ne[2] == 1 && src1->ne[3] == 1) {
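// This path packs src0/src1 into RGBA FP16 images and runs the sampler-based matmul kernel;
// if the size and device-limit checks below fail, control falls through to the buffer-based kernels.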

const int M = src0->ne[1];
const int N = src1->ne[1];
const int K = src0->ne[0];

// Performance threshold: only use this path for reasonably large matrices,
// where the sampler-based kernel can outweigh the extra packing passes and image setup.
if (M > 32 && N > 32 && K > 32) {
const size_t k_padded_4 = (size_t)((K + 3) / 4);
const size_t n_padded_4 = (size_t)((N + 3) / 4);
// Temporary images: A is K/4 x M texels, B is N/4 x K texels, RGBA FP16 each.
const size_t temp_a_size = k_padded_4 * (size_t)M * 4 * sizeof(ggml_fp16_t);
const size_t temp_b_size = n_padded_4 * (size_t)K * 4 * sizeof(ggml_fp16_t);
const size_t total_temp_image_size = temp_a_size + temp_b_size;

// Safety checks against the device image limits and a memory budget
if (k_padded_4 <= backend_ctx->max_image_width &&
(size_t)M <= backend_ctx->max_image_height &&
n_padded_4 <= backend_ctx->max_image_width &&
(size_t)K <= backend_ctx->max_image_height &&
total_temp_image_size < (backend_ctx->global_mem_size / 4)) { // temp images must stay under 25% of global memory
ggml_cl_mul_mat_f16_f32_image(backend, src0, src1, dst);
return;
}
}
}

if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32 &&
src0->ne[1] > 32 && // M > 32
src1->ne[1] > 32 && // N > 32
61 changes: 61 additions & 0 deletions ggml/src/ggml-opencl/kernels/mul_mat_f16_f32_image.cl
@@ -0,0 +1,61 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
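// Computes dst = src0 * src1 for A (M x K, FP16) and B (K x N, FP32 converted to FP16), both
// supplied as RGBA FP16 images produced by pack_a_for_image / pack_b_for_image.
// Each work-item accumulates a 1x4 strip of output row m over the full K dimension; A reads
// need no guard because pack_a_for_image zero-pads the trailing K remainder, while B reads
// are bounds-checked. Result element (m, n) is written to C at index n * M + m (ne0 = M contiguous).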

__constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

__kernel void mul_mat_f16_f32_image(
__read_only image2d_t A_img,
__read_only image2d_t B_img,
__global float* C_buf,
const ulong c_offset,
const int M,
const int N,
const int K
) {
const int n_4_idx = get_global_id(0);
const int m_idx = get_global_id(1);

const int n_base = n_4_idx << 2;

if (n_base >= N || m_idx >= M) {
return;
}

float4 c_vals = (float4)(0.0f);
const int K_4 = (K + 3) / 4;
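// Walk K four values at a time: one A texel supplies 4 K-values, each paired with a 4-wide N strip from B.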

for (int k_4_idx = 0; k_4_idx < K_4; ++k_4_idx) {
int k_base = k_4_idx << 2;

float4 a_vals = convert_float4(read_imageh(A_img, SAMPLER, (int2)(k_4_idx, m_idx)));

if (k_base < K) {
float4 b0 = convert_float4(read_imageh(B_img, SAMPLER, (int2)(n_4_idx, k_base + 0)));
c_vals = mad(a_vals.x, b0, c_vals);
}
if (k_base + 1 < K) {
float4 b1 = convert_float4(read_imageh(B_img, SAMPLER, (int2)(n_4_idx, k_base + 1)));
c_vals = mad(a_vals.y, b1, c_vals);
}
if (k_base + 2 < K) {
float4 b2 = convert_float4(read_imageh(B_img, SAMPLER, (int2)(n_4_idx, k_base + 2)));
c_vals = mad(a_vals.z, b2, c_vals);
}
if (k_base + 3 < K) {
float4 b3 = convert_float4(read_imageh(B_img, SAMPLER, (int2)(n_4_idx, k_base + 3)));
c_vals = mad(a_vals.w, b3, c_vals);
}
}

__global float* C = (__global float*)((__global char*)C_buf + c_offset);

if (n_base + 3 < N) {
C[(n_base + 0) * M + m_idx] = c_vals.x;
C[(n_base + 1) * M + m_idx] = c_vals.y;
C[(n_base + 2) * M + m_idx] = c_vals.z;
C[(n_base + 3) * M + m_idx] = c_vals.w;
} else {
if (n_base < N) C[n_base * M + m_idx] = c_vals.x;
if (n_base + 1 < N) C[(n_base + 1) * M + m_idx] = c_vals.y;
if (n_base + 2 < N) C[(n_base + 2) * M + m_idx] = c_vals.z;
}
}
29 changes: 29 additions & 0 deletions ggml/src/ggml-opencl/kernels/pack_a_for_image.cl
@@ -0,0 +1,29 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
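// Packs the row-major FP16 matrix A (M x K, element (m, k) at m * K + k) into a K/4 x M RGBA FP16 image:
// texel (k/4, m) holds A[m][k..k+3], zero-padded past K.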

__kernel void pack_a_for_image(
__global const half* src_a,
const ulong a_offset,
__write_only image2d_t dest_img,
const int M,
const int K
) {
const int k_4_idx = get_global_id(0);
const int m_idx = get_global_id(1);

const int k_base = k_4_idx << 2;

if (k_base >= K || m_idx >= M) {
return;
}

__global const half* a_ptr = (__global const half*)((__global const char*)src_a + a_offset);
const int a_idx_base = m_idx * K + k_base;

half4 vals;
vals.x = a_ptr[a_idx_base];
vals.y = (k_base + 1 < K) ? a_ptr[a_idx_base + 1] : (half)0.0h;
vals.z = (k_base + 2 < K) ? a_ptr[a_idx_base + 2] : (half)0.0h;
vals.w = (k_base + 3 < K) ? a_ptr[a_idx_base + 3] : (half)0.0h;

write_imageh(dest_img, (int2)(k_4_idx, m_idx), vals);
}
28 changes: 28 additions & 0 deletions ggml/src/ggml-opencl/kernels/pack_b_for_image.cl
@@ -0,0 +1,28 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
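// Packs the FP32 matrix B (element (k, n) at n * K + k, i.e. K contiguous) into an N/4 x K RGBA FP16 image:
// texel (n/4, k) holds B[k][n..n+3] converted to half, zero-padded past N.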

__kernel void pack_b_for_image(
__global const float* src_b,
const ulong b_offset,
__write_only image2d_t dest_img,
const int K,
const int N
) {
const int n_4_idx = get_global_id(0);
const int k_idx = get_global_id(1);

const int n_base = n_4_idx << 2;

if (n_base >= N || k_idx >= K) {
return;
}

__global const float* b_ptr = (__global const float*)((__global const char*)src_b + b_offset);

half4 vals;
vals.x = convert_half(b_ptr[n_base * K + k_idx]);
vals.y = (n_base + 1 < N) ? convert_half(b_ptr[(n_base + 1) * K + k_idx]) : (half)0.0h;
vals.z = (n_base + 2 < N) ? convert_half(b_ptr[(n_base + 2) * K + k_idx]) : (half)0.0h;
vals.w = (n_base + 3 < N) ? convert_half(b_ptr[(n_base + 3) * K + k_idx]) : (half)0.0h;

write_imageh(dest_img, (int2)(n_4_idx, k_idx), vals);
}