diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/format.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/format.hpp index 04771fbe8ca186..aa60d61f133596 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/format.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/format.hpp @@ -200,6 +200,8 @@ struct format { os_is_zyx_isa8_osv16_isv4, ///< format for weights for fully connected MMAD os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4, ///< format for weights for MMAD fsv32 convolution os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4, ///< format for weights for MMAD fsv32 convolution + os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2, ///< format for weights for MMAD fsv32 convolution + os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2, ///< format for weights for MMAD fsv32 convolution os_is_zyx_osa4_isa8_osv8_isv4, ///< format for weights for MMAD fsv32 convolution os_is_yx_osa4_isa8_osv8_isv4, ///< format for weights for MMAD fsv32 convolution os_is_yx_osv16_isv4, ///< format for weights for IMAD convolutions diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp index 9833b750caf06c..2adf2c67d3f5ef 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp @@ -538,6 +538,10 @@ kernel_selector::weights_layout to_weights_layout(format f, bool is_grouped) { return kernel_selector::weights_layout::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4; case format::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4: return kernel_selector::weights_layout::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4; + case format::os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2: + return kernel_selector::weights_layout::os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2; + case format::os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2: + return kernel_selector::weights_layout::os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2; case format::os_is_yx_osv16_isv4: return kernel_selector::weights_layout::os_is_yx_osv16_isv4; case format::os_is_yx_osv32_isv4_swizzled_by_2: @@ -728,6 +732,10 @@ cldnn::format::type from_weights_layout(kernel_selector::weights_layout l) { return cldnn::format::os_is_zyx_isa8_osv16_isv4; case kernel_selector::weights_layout::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4: return cldnn::format::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4; + case kernel_selector::weights_layout::os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2: + return cldnn::format::os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2; + case kernel_selector::weights_layout::os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2: + return cldnn::format::os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2; case kernel_selector::weights_layout::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4: return cldnn::format::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4; case kernel_selector::weights_layout::os_is_yx_osv32_isv4_swizzled_by_2: diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_mmad_b_fs_yx_fsv32.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_mmad_b_fs_yx_fsv32.cl index 0620115cfab300..af40bf5f68eb60 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_mmad_b_fs_yx_fsv32.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_mmad_b_fs_yx_fsv32.cl @@ -27,7 +27,9 @@ #define ACTIVATION_TYPE_VEC float8 #define TO_ACTIVATION_TYPE_VEC(x) convert_float8(x) #define MMAD MMAD_8x8 +#if SUB_GROUP_SIZE == 8 #define BLOCK_WRITE(ptr, val) _sub_group_block_write8((__global uint*)(ptr), as_uint8(val)); +#endif #elif OUTPUT_X_BLOCK_SIZE == 4 #define PACKED_TYPE_VEC MAKE_VECTOR_TYPE(PACKED_IN_TYPE, 4) #define ACCUMULATOR_TYPE_VEC int4 @@ -35,7 +37,9 @@ #define ACTIVATION_TYPE_VEC float4 #define TO_ACTIVATION_TYPE_VEC(x) convert_float4(x) #define MMAD MMAD_4x8 +#if SUB_GROUP_SIZE == 8 #define BLOCK_WRITE(ptr, val) _sub_group_block_write4((__global uint*)(ptr), as_uint4(val)); +#endif #else #error "convolution_gpu_mmad_b_fs_yx_fsv32: Unsupported block size" #endif @@ -50,7 +54,8 @@ #error "convolution_gpu_mmad_b_fs_yx_fsv32: Unsupported FILTER_TYPE" #endif -__attribute__((reqd_work_group_size(8, OW_GROUP, 1))) + +__attribute__((reqd_work_group_size(SUB_GROUP_SIZE, OW_GROUP, 1))) REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE) KERNEL(convolution_mmad_b_fs_yx_fsv32)( __global INPUT0_TYPE* input, @@ -89,13 +94,16 @@ KERNEL(convolution_mmad_b_fs_yx_fsv32)( const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y; const int input_z = z * STRIDE_SIZE_Z - PADDING_SIZE_Z; - ACCUMULATOR_TYPE_VEC acc[4] = { 0 }; // 4*8 packed channels * OUTPUT_X_BLOCK_SIZE + ACCUMULATOR_TYPE_VEC acc[OF_TO_DO] = { 0 }; // OF_TO_DO*8 packed channels * OUTPUT_X_BLOCK_SIZE #if ASYMMETRIC_WEIGHTS_QUANTIZATION ACCUMULATOR_TYPE_VEC acc_assym_weights = 0; #endif +#if INPUT0_DIMS == 5 + const uint input_offset = INPUT0_GET_INDEX(b,0,0,0,0); +#else //INPUT0_DIMS == 5 const uint input_offset = INPUT0_GET_INDEX(b,0,0,0); - +#endif //INPUT0_DIMS == 5 uint filter_idx = fg * FILTER_SIZE_X * FILTER_SIZE_Y * FILTER_SIZE_Z * ISV_SIZE * OSV_SIZE * IFM_BLOCKS; const int input_x_pitch = ISV_SIZE; @@ -106,12 +114,13 @@ KERNEL(convolution_mmad_b_fs_yx_fsv32)( for (int icb = 0; icb < IFM_BLOCKS; ++icb) { #if ASYMMETRIC_WEIGHTS_QUANTIZATION - uchar4 m; - __attribute__((opencl_unroll_hint(4))) - for (int i = 0; i < 4; i++) { - m[i] = icb*32 + lid*4 + i < INPUT0_FEATURE_NUM; + uchar4 m = { 0, 0, 0, 0 }; + __attribute__((opencl_unroll_hint(OF_TO_DO))) + for (int i = 0; i < OF_TO_DO; i++) { + m[i] = icb*32 + lid*OF_TO_DO + i < INPUT0_FEATURE_NUM; } int mm = as_int(m); +#if SUB_GROUP_SIZE == 8 int8 multiplier = (int8)(sub_group_broadcast(mm, 0), sub_group_broadcast(mm, 1), sub_group_broadcast(mm, 2), @@ -120,6 +129,16 @@ KERNEL(convolution_mmad_b_fs_yx_fsv32)( sub_group_broadcast(mm, 5), sub_group_broadcast(mm, 6), sub_group_broadcast(mm, 7)); +#else //SUB_GROUP_SIZE == 8 + int8 multiplier = (int8)(sub_group_broadcast(mm, 0) + (sub_group_broadcast(mm, 1) << 16), + sub_group_broadcast(mm, 2) + (sub_group_broadcast(mm, 3) << 16), + sub_group_broadcast(mm, 4) + (sub_group_broadcast(mm, 5) << 16), + sub_group_broadcast(mm, 6) + (sub_group_broadcast(mm, 7) << 16), + sub_group_broadcast(mm, 8) + (sub_group_broadcast(mm, 9) << 16), + sub_group_broadcast(mm, 10) + (sub_group_broadcast(mm, 11) << 16), + sub_group_broadcast(mm, 12) + (sub_group_broadcast(mm, 13) << 16), + sub_group_broadcast(mm, 14) + (sub_group_broadcast(mm, 15) << 16)); +#endif #endif __attribute__((opencl_unroll_hint(FILTER_SIZE_Z))) @@ -142,23 +161,24 @@ KERNEL(convolution_mmad_b_fs_yx_fsv32)( int xb = 0; for (; xb < INPUT_LINE_SIZE; xb++) { - - bool x_cross_fm = input_x + xb < 0 || input_x + xb >= INPUT0_SIZE_X; - if (y_cross_fm || x_cross_fm || z_cross_fm) { + if (lid < 8){ + bool x_cross_fm = input_x + xb < 0 || input_x + xb >= INPUT0_SIZE_X; + if (y_cross_fm || x_cross_fm || z_cross_fm) { #if ASYMMETRIC_DATA_QUANTIZATION - const int azp_idx = (icb*ISV_SIZE + 4*lid) % ACTIVATIONS_ZERO_POINTS_FEATURE_NUM; - line_cache[xb] = AS_TYPE(PACKED_IN_TYPE, ((const __global uint*)(activations_zp + azp_idx))[0]); + const int azp_idx = (icb*ISV_SIZE + 4*lid) % ACTIVATIONS_ZERO_POINTS_FEATURE_NUM; + line_cache[xb] = AS_TYPE(PACKED_IN_TYPE, ((const __global uint*)(activations_zp + azp_idx))[0]); #else - line_cache[xb] = 0; + line_cache[xb] = 0; #endif - } - else - { - line_cache[xb] = AS_TYPE(PACKED_IN_TYPE, _sub_group_block_read((const __global uint*)(input + in_addr + - icb * input_fs_pitch + - kd * DILATION_SIZE_Z * input_z_pitch + - kh * DILATION_SIZE_Y * input_y_pitch + - xb * input_x_pitch))); + } + else + { + line_cache[xb] = AS_TYPE(PACKED_IN_TYPE, _sub_group_block_read((const __global uint*)(input + in_addr + + icb * input_fs_pitch + + kd * DILATION_SIZE_Z * input_z_pitch + + kh * DILATION_SIZE_Y * input_y_pitch + + xb * input_x_pitch))); + } } } } @@ -175,16 +195,23 @@ KERNEL(convolution_mmad_b_fs_yx_fsv32)( + kh * ISV_SIZE * OSV_SIZE * FILTER_SIZE_X + kw * ISV_SIZE * OSV_SIZE; + +#if SUB_GROUP_SIZE == 8 MAKE_VECTOR_TYPE(PACKED_WEIGHTS_TYPE, 8) weights_data0 = AS_PACKED_WEIGHTS_TYPE_VEC8(_sub_group_block_read8((const __global uint*)(weights + f_off + 0*8*ISV_SIZE))); MAKE_VECTOR_TYPE(PACKED_WEIGHTS_TYPE, 8) weights_data1 = AS_PACKED_WEIGHTS_TYPE_VEC8(_sub_group_block_read8((const __global uint*)(weights + f_off + 1*8*ISV_SIZE))); MAKE_VECTOR_TYPE(PACKED_WEIGHTS_TYPE, 8) weights_data2 = AS_PACKED_WEIGHTS_TYPE_VEC8(_sub_group_block_read8((const __global uint*)(weights + f_off + 2*8*ISV_SIZE))); MAKE_VECTOR_TYPE(PACKED_WEIGHTS_TYPE, 8) weights_data3 = AS_PACKED_WEIGHTS_TYPE_VEC8(_sub_group_block_read8((const __global uint*)(weights + f_off + 3*8*ISV_SIZE))); +#else //SUB_GROUP_SIZE == 8 + MAKE_VECTOR_TYPE(PACKED_WEIGHTS_TYPE, 8) weights_data0 = AS_PACKED_WEIGHTS_TYPE_VEC8(_sub_group_block_read8((const __global uint*)(weights + f_off + 0*16*ISV_SIZE))); + MAKE_VECTOR_TYPE(PACKED_WEIGHTS_TYPE, 8) weights_data1 = AS_PACKED_WEIGHTS_TYPE_VEC8(_sub_group_block_read8((const __global uint*)(weights + f_off + 1*16*ISV_SIZE))); +#endif acc[0] = MMAD(src, weights_data0, acc[0]); // 8 elements in 4*lid+0 out channel acc[1] = MMAD(src, weights_data1, acc[1]); // 8 elements in 4*lid+1 out channel +#if SUB_GROUP_SIZE == 8 acc[2] = MMAD(src, weights_data2, acc[2]); // 8 elements in 4*lid+2 out channel acc[3] = MMAD(src, weights_data3, acc[3]); // 8 elements in 4*lid+3 out channel - +#endif #if ASYMMETRIC_WEIGHTS_QUANTIZATION acc_assym_weights = MMAD(src, multiplier, acc_assym_weights); #endif @@ -198,62 +225,76 @@ KERNEL(convolution_mmad_b_fs_yx_fsv32)( #endif #if OUTPUT_IS_FP - MAKE_VECTOR_TYPE(OUTPUT_TYPE, OUTPUT_X_BLOCK_SIZE) dst[4]; + MAKE_VECTOR_TYPE(OUTPUT_TYPE, OUTPUT_X_BLOCK_SIZE) dst[OF_TO_DO]; for (int i = 0; i < OUTPUT_X_BLOCK_SIZE; i++) { #if BIAS_TERM - ACTIVATION_TYPE res0 = TO_ACTIVATION_TYPE(acc[0][i]) + (ACTIVATION_TYPE)(biases[bias_index + 4*lid+0]); - ACTIVATION_TYPE res1 = TO_ACTIVATION_TYPE(acc[1][i]) + (ACTIVATION_TYPE)(biases[bias_index + 4*lid+1]); - ACTIVATION_TYPE res2 = TO_ACTIVATION_TYPE(acc[2][i]) + (ACTIVATION_TYPE)(biases[bias_index + 4*lid+2]); - ACTIVATION_TYPE res3 = TO_ACTIVATION_TYPE(acc[3][i]) + (ACTIVATION_TYPE)(biases[bias_index + 4*lid+3]); + ACTIVATION_TYPE res0 = TO_ACTIVATION_TYPE(acc[0][i]) + (ACTIVATION_TYPE)(biases[bias_index + OF_TO_DO*lid+0]); + ACTIVATION_TYPE res1 = TO_ACTIVATION_TYPE(acc[1][i]) + (ACTIVATION_TYPE)(biases[bias_index + OF_TO_DO*lid+1]); +#if SUB_GROUP_SIZE == 8 + ACTIVATION_TYPE res2 = TO_ACTIVATION_TYPE(acc[2][i]) + (ACTIVATION_TYPE)(biases[bias_index + OF_TO_DO*lid+2]); + ACTIVATION_TYPE res3 = TO_ACTIVATION_TYPE(acc[3][i]) + (ACTIVATION_TYPE)(biases[bias_index + OF_TO_DO*lid+3]); +#endif #else ACTIVATION_TYPE res0 = TO_ACTIVATION_TYPE(acc[0][i]); ACTIVATION_TYPE res1 = TO_ACTIVATION_TYPE(acc[1][i]); +#if SUB_GROUP_SIZE == 8 ACTIVATION_TYPE res2 = TO_ACTIVATION_TYPE(acc[2][i]); ACTIVATION_TYPE res3 = TO_ACTIVATION_TYPE(acc[3][i]); #endif +#endif #if ASYMMETRIC_WEIGHTS_QUANTIZATION - const uint idx0 = fg*OSV_SIZE + 4*lid + 0; - const uint idx1 = fg*OSV_SIZE + 4*lid + 1; - const uint idx2 = fg*OSV_SIZE + 4*lid + 2; - const uint idx3 = fg*OSV_SIZE + 4*lid + 3; + const uint idx0 = fg*OSV_SIZE + OF_TO_DO*lid + 0; + const uint idx1 = fg*OSV_SIZE + OF_TO_DO*lid + 1; +#if SUB_GROUP_SIZE == 8 + const uint idx2 = fg*OSV_SIZE + OF_TO_DO*lid + 2; + const uint idx3 = fg*OSV_SIZE + OF_TO_DO*lid + 3; +#endif res0 -= acc_assym_weights[i] * TO_ACCUMULATOR_TYPE(weights_zp[idx0]); res1 -= acc_assym_weights[i] * TO_ACCUMULATOR_TYPE(weights_zp[idx1]); +#if SUB_GROUP_SIZE == 8 res2 -= acc_assym_weights[i] * TO_ACCUMULATOR_TYPE(weights_zp[idx2]); res3 -= acc_assym_weights[i] * TO_ACCUMULATOR_TYPE(weights_zp[idx3]); +#endif #endif // ASYMMETRIC_WEIGHTS_QUANTIZATION #if ASYMMETRIC_DATA_QUANTIZATION - res0 += compensation[fg*OSV_SIZE + 4*lid + 0]; - res1 += compensation[fg*OSV_SIZE + 4*lid + 1]; - res2 += compensation[fg*OSV_SIZE + 4*lid + 2]; - res3 += compensation[fg*OSV_SIZE + 4*lid + 3]; + res0 += compensation[fg*OSV_SIZE + OF_TO_DO*lid + 0]; + res1 += compensation[fg*OSV_SIZE + OF_TO_DO*lid + 1]; +#if SUB_GROUP_SIZE == 8 + res2 += compensation[fg*OSV_SIZE + OF_TO_DO*lid + 2]; + res3 += compensation[fg*OSV_SIZE + OF_TO_DO*lid + 3]; +#endif #endif // ASYMMETRIC_DATA_QUANTIZATION #if HAS_FUSED_OPS { FUSED_OPS_0; dst[0][i] = FUSED_OPS_RESULT_0; }; { FUSED_OPS_1; dst[1][i] = FUSED_OPS_RESULT_1; }; +#if SUB_GROUP_SIZE == 8 { FUSED_OPS_2; dst[2][i] = FUSED_OPS_RESULT_2; }; { FUSED_OPS_3; dst[3][i] = FUSED_OPS_RESULT_3; }; +#endif #else dst[0][i] = TO_OUTPUT_TYPE(res0); dst[1][i] = TO_OUTPUT_TYPE(res1); +#if SUB_GROUP_SIZE == 8 dst[2][i] = TO_OUTPUT_TYPE(res2); dst[3][i] = TO_OUTPUT_TYPE(res3); +#endif #endif } for (int i = 0; i < OUTPUT_X_BLOCK_SIZE; i++) { - for (int ofm = 0; ofm < 4; ofm++) { + for (int ofm = 0; ofm < OF_TO_DO; ofm++) { #if OUTPUT_DIMS == 5 - const uint dst_index = OUTPUT_GET_INDEX(b, fg*OSV_SIZE + ofm + 4*lid, z, y, x+i); + const uint dst_index = OUTPUT_GET_INDEX(b, fg*OSV_SIZE + ofm + OF_TO_DO*lid, z, y, x+i); #elif OUTPUT_DIMS <= 4 - const uint dst_index = OUTPUT_GET_INDEX(b, fg*OSV_SIZE + ofm + 4*lid, y, x+i); + const uint dst_index = OUTPUT_GET_INDEX(b, fg*OSV_SIZE + ofm + OF_TO_DO*lid, y, x+i); #endif bool full_x = OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE == 0 || x + i < OUTPUT_SIZE_X; - bool full_f = OUTPUT_FEATURE_NUM % OSV_SIZE == 0 || fg * OSV_SIZE + 4 * lid + ofm < OUTPUT_FEATURE_NUM; + bool full_f = OUTPUT_FEATURE_NUM % OSV_SIZE == 0 || fg * OSV_SIZE + OF_TO_DO * lid + ofm < OUTPUT_FEATURE_NUM; if (full_x && full_f) { output[dst_index] = dst[ofm][i]; } @@ -264,92 +305,113 @@ KERNEL(convolution_mmad_b_fs_yx_fsv32)( for (int i = 0; i < OUTPUT_X_BLOCK_SIZE; i++) { #if BIAS_TERM - ACTIVATION_TYPE res0 = TO_ACTIVATION_TYPE(acc[0][i]) + (ACTIVATION_TYPE)(biases[bias_index + 4*lid+0]); - ACTIVATION_TYPE res1 = TO_ACTIVATION_TYPE(acc[1][i]) + (ACTIVATION_TYPE)(biases[bias_index + 4*lid+1]); - ACTIVATION_TYPE res2 = TO_ACTIVATION_TYPE(acc[2][i]) + (ACTIVATION_TYPE)(biases[bias_index + 4*lid+2]); - ACTIVATION_TYPE res3 = TO_ACTIVATION_TYPE(acc[3][i]) + (ACTIVATION_TYPE)(biases[bias_index + 4*lid+3]); + ACTIVATION_TYPE res0 = TO_ACTIVATION_TYPE(acc[0][i]) + (ACTIVATION_TYPE)(biases[bias_index + OF_TO_DO*lid+0]); + ACTIVATION_TYPE res1 = TO_ACTIVATION_TYPE(acc[1][i]) + (ACTIVATION_TYPE)(biases[bias_index + OF_TO_DO*lid+1]); +#if SUB_GROUP_SIZE == 8 + ACTIVATION_TYPE res2 = TO_ACTIVATION_TYPE(acc[2][i]) + (ACTIVATION_TYPE)(biases[bias_index + OF_TO_DO*lid+2]); + ACTIVATION_TYPE res3 = TO_ACTIVATION_TYPE(acc[3][i]) + (ACTIVATION_TYPE)(biases[bias_index + OF_TO_DO*lid+3]); +#endif #else ACTIVATION_TYPE res0 = TO_ACTIVATION_TYPE(acc[0][i]); ACTIVATION_TYPE res1 = TO_ACTIVATION_TYPE(acc[1][i]); +#if SUB_GROUP_SIZE == 8 ACTIVATION_TYPE res2 = TO_ACTIVATION_TYPE(acc[2][i]); ACTIVATION_TYPE res3 = TO_ACTIVATION_TYPE(acc[3][i]); #endif +#endif #if ASYMMETRIC_WEIGHTS_QUANTIZATION - const uint idx0 = fg*OSV_SIZE + 4*lid + 0; - const uint idx1 = fg*OSV_SIZE + 4*lid + 1; - const uint idx2 = fg*OSV_SIZE + 4*lid + 2; - const uint idx3 = fg*OSV_SIZE + 4*lid + 3; - + const uint idx0 = fg*OSV_SIZE + OF_TO_DO*lid + 0; + const uint idx1 = fg*OSV_SIZE + OF_TO_DO*lid + 1; +#if SUB_GROUP_SIZE == 8 + const uint idx2 = fg*OSV_SIZE + OF_TO_DO*lid + 2; + const uint idx3 = fg*OSV_SIZE + OF_TO_DO*lid + 3; +#endif res0 -= acc_assym_weights[i] * TO_ACCUMULATOR_TYPE(weights_zp[idx0]); res1 -= acc_assym_weights[i] * TO_ACCUMULATOR_TYPE(weights_zp[idx1]); +#if SUB_GROUP_SIZE == 8 res2 -= acc_assym_weights[i] * TO_ACCUMULATOR_TYPE(weights_zp[idx2]); res3 -= acc_assym_weights[i] * TO_ACCUMULATOR_TYPE(weights_zp[idx3]); +#endif #endif // ASYMMETRIC_WEIGHTS_QUANTIZATION #if ASYMMETRIC_DATA_QUANTIZATION - res0 += compensation[fg*OSV_SIZE + 4*lid + 0]; - res1 += compensation[fg*OSV_SIZE + 4*lid + 1]; - res2 += compensation[fg*OSV_SIZE + 4*lid + 2]; - res3 += compensation[fg*OSV_SIZE + 4*lid + 3]; + res0 += compensation[fg*OSV_SIZE + OF_TO_DO*lid + 0]; + res1 += compensation[fg*OSV_SIZE + OF_TO_DO*lid + 1]; +#if SUB_GROUP_SIZE == 8 + res2 += compensation[fg*OSV_SIZE + OF_TO_DO*lid + 2]; + res3 += compensation[fg*OSV_SIZE + OF_TO_DO*lid + 3]; +#endif #endif // ASYMMETRIC_DATA_QUANTIZATION - MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4) pack; + MAKE_VECTOR_TYPE(OUTPUT_TYPE, OF_TO_DO) pack; #if HAS_FUSED_OPS { FUSED_OPS_0; pack[0] = FUSED_OPS_RESULT_0; }; { FUSED_OPS_1; pack[1] = FUSED_OPS_RESULT_1; }; +#if SUB_GROUP_SIZE == 8 { FUSED_OPS_2; pack[2] = FUSED_OPS_RESULT_2; }; { FUSED_OPS_3; pack[3] = FUSED_OPS_RESULT_3; }; +#endif #else pack[0] = TO_OUTPUT_TYPE(res0); pack[1] = TO_OUTPUT_TYPE(res1); +#if SUB_GROUP_SIZE == 8 pack[2] = TO_OUTPUT_TYPE(res2); pack[3] = TO_OUTPUT_TYPE(res3); +#endif #endif dst[i] = AS_PACKED_OUT_TYPE(pack); } const bool full_x = OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE == 0 || x + OUTPUT_X_BLOCK_SIZE <= OUTPUT_SIZE_X; const bool full_f = OUTPUT_FEATURE_NUM % OSV_SIZE == 0 || (fg + 1) * OSV_SIZE <= OUTPUT_FEATURE_NUM; +#if SUB_GROUP_SIZE == 8 if (full_x && full_f) { #if OUTPUT_DIMS == 5 - const uint dst_index = (OUTPUT_GET_INDEX(b, fg*OSV_SIZE, z, y, x)) / 4; + const uint dst_index = (OUTPUT_GET_INDEX(b, fg*OSV_SIZE, z, y, x)) / OF_TO_DO; #elif OUTPUT_DIMS <= 4 - const uint dst_index = (OUTPUT_GET_INDEX(b, fg*OSV_SIZE, y, x)) / 4; + const uint dst_index = (OUTPUT_GET_INDEX(b, fg*OSV_SIZE, y, x)) / OF_TO_DO; #endif BLOCK_WRITE(output + dst_index, dst); } else { -#if OUTPUT_FEATURE_NUM % 4 == 0 +#endif //SUB_GROUP_SIZE == 8 +#if OUTPUT_FEATURE_NUM % OF_TO_DO == 0 for (int i = 0; i < OUTPUT_X_BLOCK_SIZE; i++) { const bool full_it_x = OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE == 0 || x + i < OUTPUT_SIZE_X; - const bool full_sgl_f = OUTPUT_FEATURE_NUM % OSV_SIZE == 0 || fg * OSV_SIZE + 4 * lid < OUTPUT_FEATURE_NUM; + const bool full_sgl_f = OUTPUT_FEATURE_NUM % OSV_SIZE == 0 || fg * OSV_SIZE + OF_TO_DO * lid < OUTPUT_FEATURE_NUM; if (full_it_x && full_sgl_f) { # if OUTPUT_DIMS == 5 - const uint dst_index = OUTPUT_GET_INDEX(b, fg*OSV_SIZE + 4*lid, z, y, x+i); + const uint dst_index = OUTPUT_GET_INDEX(b, fg*OSV_SIZE + OF_TO_DO*lid, z, y, x+i); # elif OUTPUT_DIMS <= 4 - const uint dst_index = OUTPUT_GET_INDEX(b, fg*OSV_SIZE + 4*lid, y, x+i); + const uint dst_index = OUTPUT_GET_INDEX(b, fg*OSV_SIZE + OF_TO_DO*lid, y, x+i); # endif - output[dst_index/4] = dst[i]; + output[dst_index/OF_TO_DO] = dst[i]; } } -#else // OUTPUT_FEATURE_NUM % 4 == 0 +#else // OUTPUT_FEATURE_NUM % OF_TO_DO == 0 for (int i = 0; i < OUTPUT_X_BLOCK_SIZE; i++) { - for (int ofm = 0; ofm < 4; ++ofm) { + for (int ofm = 0; ofm < OF_TO_DO; ++ofm) { const bool full_it_x = OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE == 0 || x + i < OUTPUT_SIZE_X; - const bool full_sgl_f = OUTPUT_FEATURE_NUM % OSV_SIZE == 0 || fg * OSV_SIZE + 4 * lid + ofm < OUTPUT_FEATURE_NUM; + const bool full_sgl_f = OUTPUT_FEATURE_NUM % OSV_SIZE == 0 || fg * OSV_SIZE + OF_TO_DO * lid + ofm < OUTPUT_FEATURE_NUM; if (full_it_x && full_sgl_f) { # if OUTPUT_DIMS == 5 - const uint dst_index = OUTPUT_GET_INDEX(b, fg*OSV_SIZE + 4*lid + ofm, z, y, x+i); + const uint dst_index = OUTPUT_GET_INDEX(b, fg*OSV_SIZE + OF_TO_DO*lid + ofm, z, y, x+i); # elif OUTPUT_DIMS <= 4 - const uint dst_index = OUTPUT_GET_INDEX(b, fg*OSV_SIZE + 4*lid + ofm, y, x+i); + const uint dst_index = OUTPUT_GET_INDEX(b, fg*OSV_SIZE + OF_TO_DO*lid + ofm, y, x+i); # endif +#if SUB_GROUP_SIZE == 8 ((__global uchar*)output)[dst_index] = as_uchar4(dst[i])[ofm]; +#else + ((__global uchar*)output)[dst_index] = as_uchar2(dst[i])[ofm]; +#endif } } } #endif // OUTPUT_FEATURE_NUM % 4 == 0 +#if SUB_GROUP_SIZE == 8 } +#endif //SUB_GROUP_SIZE == 8 #endif // OUTPUT_IS_FP } diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/fetch_weights.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/fetch_weights.cl index 08d9229a4b109c..3d2449ab7ebf0d 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/fetch_weights.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/fetch_weights.cl @@ -794,6 +794,65 @@ inline uint get_os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4_index(uint o, uint i, return idx; } +inline uint get_os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2_index( + uint o, uint i, uint y, uint x, + uint size_x, uint size_y, + uint size_ifm, uint size_ofm, + uint offset) +{ + const uint o_swizzled = (o % 2) * 16 + ((o % 32) / 2) + (o / 32) * 32; + const uint isv_idx = i % 4; + const uint isa_idx = (i / 4) % 8; + const uint is_idx = i / 32; + const uint osv_idx = o_swizzled % 16; + const uint osa_idx = (o_swizzled / 16) % 2; + const uint os_idx = o / 32; + + const uint f_32_aligned = (size_ifm + 31) / 32; + + size_t idx = offset + + isv_idx + + osv_idx * 4 + + isa_idx * 16 * 4 + + osa_idx * 16 * 32 + + x * 32 * 32 + + y * size_x * 32 * 32 + + is_idx * 32 * 32 * size_x * size_y + + os_idx * 32 * 32 * f_32_aligned * size_x * size_y; + + return idx; +} + +inline uint get_os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2_index( + uint o, uint i, uint z, uint y, uint x, + uint size_x, uint size_y, uint size_z, + uint size_ifm, uint size_ofm, + uint offset) +{ + const uint o_swizzled = (o % 2) * 16 + ((o % 32) / 2) + (o / 32) * 32; + const uint isv_idx = i % 4; + const uint isa_idx = (i / 4) % 8; + const uint is_idx = i / 32; + const uint osv_idx = o_swizzled % 16; + const uint osa_idx = (o_swizzled / 16) % 2; + const uint os_idx = o / 32; + + const uint f_32_aligned = (size_ifm + 31) / 32; + + size_t idx = offset + + isv_idx + + osv_idx * 4 + + isa_idx * 16 * 4 + + osa_idx * 16 * 32 + + x * 32 * 32 + + y * size_x * 32 * 32 + + z * size_x * size_y * 32 * 32 + + is_idx * 32 * 32 * size_x * size_y * size_z + + os_idx * 32 * 32 * f_32_aligned * size_x * size_y * size_z; + + return idx; +} + inline uint get_os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4_index(uint o, uint i, uint z, uint y, uint x, uint size_x, uint size_y, uint size_z, uint size_ifm, uint size_ofm, uint offset) @@ -985,6 +1044,25 @@ inline uint get_g_is_os_yx_isa4_osa8_isv8_osv4(uint g, uint o, uint i, uint z, u CAT(prefix, _OFM_NUM), \ CAT(prefix, _OFFSET)) +#define GET_FILTER_OS_IS_YX_OSA2_ISA8_OSV16_ISV4_SWIZZLED_BY_2_INDEX(prefix, o, i, y, x) \ + get_os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2_index( \ + o, i, y, x, \ + CAT(prefix, _SIZE_X), \ + CAT(prefix, _SIZE_Y), \ + CAT(prefix, _IFM_NUM), \ + CAT(prefix, _OFM_NUM), \ + CAT(prefix, _OFFSET)) + +#define GET_FILTER_OS_IS_ZYX_OSA2_ISA8_OSV16_ISV4_SWIZZLED_BY_2_INDEX(prefix, o, i, z, y, x) \ + get_os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2_index( \ + o, i, z, y, x, \ + CAT(prefix, _SIZE_X), \ + CAT(prefix, _SIZE_Y), \ + CAT(prefix, _SIZE_Z), \ + CAT(prefix, _IFM_NUM), \ + CAT(prefix, _OFM_NUM), \ + CAT(prefix, _OFFSET)) + inline uint get_is_o32_yx_isv32_swizzled_by_4_index(uint o, uint i, uint y, uint x, uint i_size, uint o_size, uint x_size, uint y_size) { const uint o_aligned_to_32 = ((o_size + 31) / 32) * 32; diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_weights.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_weights.cl index 900bae7fe41815..9d234456185a69 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_weights.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_weights.cl @@ -431,6 +431,10 @@ inline uint FUNC(get_output_index)(uint g, uint o, uint i, uint z, uint y, uint return GET_FILTER_OS_IS_YX_OSA4_ISA8_OSV8_ISV4_SWIZZLED_BY_4_INDEX(OUTPUT, o, i, y, x); #elif defined OUTPUT_LAYOUT_OS_IS_ZYX_OSA4_ISA8_OSV8_ISV4_SWIZZLED_BY_4 return GET_FILTER_OS_IS_ZYX_OSA4_ISA8_OSV8_ISV4_SWIZZLED_BY_4_INDEX(OUTPUT, o, i, z, y, x); +#elif defined OUTPUT_LAYOUT_OS_IS_YX_OSA2_ISA8_OSV16_ISV4_SWIZZLED_BY_2 + return GET_FILTER_OS_IS_YX_OSA2_ISA8_OSV16_ISV4_SWIZZLED_BY_2_INDEX(OUTPUT, o, i, y, x); +#elif defined OUTPUT_LAYOUT_OS_IS_ZYX_OSA2_ISA8_OSV16_ISV4_SWIZZLED_BY_2 + return GET_FILTER_OS_IS_ZYX_OSA2_ISA8_OSV16_ISV4_SWIZZLED_BY_2_INDEX(OUTPUT, o, i, z, y, x); #elif defined OUTPUT_LAYOUT_OS_IS_YX_ISV16_OSV16 return GET_FILTER_OS_IS_YX_ISV_OSV_INDEX(OUTPUT, o, i, y, x, 16, 16); #elif defined OUTPUT_LAYOUT_OS_IS_ZYX_ISV16_OSV16 diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.cpp index f6c1735b46df77..5fdc58556d3f6b 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.cpp @@ -347,6 +347,8 @@ std::string toString(WeightsLayout layout) { case WeightsLayout::os_is_zyx_isa8_osv16_isv4: return "OS_IS_ZYX_ISA8_OSV16_ISV4"; case WeightsLayout::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4: return "OS_IS_YX_OSA4_ISA8_OSV8_ISV4_SWIZZLED_BY_4"; case WeightsLayout::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4: return "OS_IS_ZYX_OSA4_ISA8_OSV8_ISV4_SWIZZLED_BY_4"; + case WeightsLayout::os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2: return "OS_IS_YX_OSA2_ISA8_OSV16_ISV4_SWIZZLED_BY_2"; + case WeightsLayout::os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2: return "OS_IS_ZYX_OSA2_ISA8_OSV16_ISV4_SWIZZLED_BY_2"; case WeightsLayout::os_is_yx_osv16_isv4: return "OS_IS_YX_OSV16_ISV4"; case WeightsLayout::os_is_yx_osv32_isv4_swizzled_by_2: return "OS_IS_YX_OSV32_ISV4_SWIZZLED_BY_2"; case WeightsLayout::os_is_yx_osv32_isv4: return "OS_IS_YX_OSV32_ISV4"; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_base.cpp index b24cc863f844b3..5c75e73f981434 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_base.cpp @@ -410,8 +410,8 @@ Datatype ConvolutionKernelBase::GetPackedInputType(const convolution_params& par return GetPackedType(params.inputs[0].GetDType()); } -Datatype ConvolutionKernelBase::GetPackedOutputType(const convolution_params& params) const { - return GetPackedType(params.outputs[0].GetDType()); +Datatype ConvolutionKernelBase::GetPackedOutputType(const convolution_params& params, size_t pack_size) const { + return GetPackedType(params.outputs[0].GetDType(), pack_size); } Datatype ConvolutionKernelBase::GetActivationType(const convolution_params& params) const { diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_base.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_base.h index 8e4806d2484b44..66aa127fd467e4 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_base.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_base.h @@ -77,7 +77,7 @@ class ConvolutionKernelBase : public WeightBiasKernelBase { Datatype GetPackedType(Datatype dt, size_t pack_size = 4) const; Datatype GetPackedInputType(const convolution_params& params) const; - Datatype GetPackedOutputType(const convolution_params& params) const; + Datatype GetPackedOutputType(const convolution_params& params, size_t pack_size = 4) const; Datatype GetActivationType(const convolution_params& params) const; Datatype GetAccumulatorType(const convolution_params& params) const; void GetUpdateDispatchDataFunc(KernelData& kd) const override; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_mmad_b_fs_yx_fsv32.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_mmad_b_fs_yx_fsv32.cpp index 4627336aad8465..bf63e8ffd30f20 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_mmad_b_fs_yx_fsv32.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_mmad_b_fs_yx_fsv32.cpp @@ -63,7 +63,7 @@ bool ConvolutionKernel_mmad_b_fs_yx_fsv32::Validate(const Params& p) const { DO_NOT_USE_THIS_KERNEL(p.layerID); } - if (!IsSIMDSizeSupported(params.engineInfo, 8)) + if (!IsSIMDSizeSupported(params.engineInfo, 8) && !IsSIMDSizeSupported(params.engineInfo, 16)) DO_NOT_USE_THIS_KERNEL(p.layerID); if (params.groups > 1) @@ -109,12 +109,18 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_b_fs_yx_fsv32::SetDef break; ow_group--; } - - dispatchData.gws[0] = Align(cp.outputs[0].Feature().v, 32) / 4; + if (IsSIMDSizeSupported(cp.engineInfo, 8)) { + dispatchData.gws[0] = Align(cp.outputs[0].Feature().v, 32) / 4; + } else { + dispatchData.gws[0] = Align(cp.outputs[0].Feature().v, 32) / 2; + } dispatchData.gws[1] = Align(CeilDiv(cp.outputs[0].X().v, dispatchData.cldnnStyle.blockWidth), ow_group) * cp.outputs[0].Y().v * cp.outputs[0].Z().v; dispatchData.gws[2] = cp.outputs[0].Batch().v; - - dispatchData.lws[0] = 8; + if (IsSIMDSizeSupported(cp.engineInfo, 8)) { + dispatchData.lws[0] = 8; + } else { + dispatchData.lws[0] = 16; + } dispatchData.lws[1] = ow_group; dispatchData.lws[2] = 1; @@ -144,7 +150,13 @@ JitConstants ConvolutionKernel_mmad_b_fs_yx_fsv32::GetJitConstants(const convolu jit.AddConstant(MakeJitConstant("INPUT_LINE_SIZE", input_line_size)); jit.Merge(MakeTypeJitConstants(GetPackedInputType(params), "PACKED_IN")); - jit.Merge(MakeTypeJitConstants(GetPackedOutputType(params), "PACKED_OUT")); + if (IsSIMDSizeSupported(params.engineInfo, 8)) { + jit.Merge(MakeTypeJitConstants(GetPackedOutputType(params), "PACKED_OUT")); + jit.AddConstant(MakeJitConstant("OF_TO_DO", 4)); + } else { + jit.Merge(MakeTypeJitConstants(GetPackedOutputType(params, 2), "PACKED_OUT")); + jit.AddConstant(MakeJitConstant("OF_TO_DO", 2)); + } if (params.weights.GetDType() == WeightsType::INT8) { jit.AddConstant(MakeJitConstant("FILTER_TYPE_CHAR", 1)); } else if (params.weights.GetDType() == WeightsType::UINT8) { @@ -159,22 +171,26 @@ JitConstants ConvolutionKernel_mmad_b_fs_yx_fsv32::GetJitConstants(const convolu std::vector idx_order2; std::vector idx_order3; if (DataTensor::ChannelsCount(params.outputs[0].GetLayout()) == 4) { - idx_order0 = {"b", "(fg*32 + 4*lid+0)", "y", "(x+i)"}; - idx_order1 = {"b", "(fg*32 + 4*lid+1)", "y", "(x+i)"}; - idx_order2 = {"b", "(fg*32 + 4*lid+2)", "y", "(x+i)"}; - idx_order3 = {"b", "(fg*32 + 4*lid+3)", "y", "(x+i)"}; + idx_order0 = {"b", "(fg*32 + OF_TO_DO*lid+0)", "y", "(x+i)"}; + idx_order1 = {"b", "(fg*32 + OF_TO_DO*lid+1)", "y", "(x+i)"}; + idx_order2 = {"b", "(fg*32 + OF_TO_DO*lid+2)", "y", "(x+i)"}; + idx_order3 = {"b", "(fg*32 + OF_TO_DO*lid+3)", "y", "(x+i)"}; } else if (DataTensor::ChannelsCount(params.outputs[0].GetLayout()) == 5) { - idx_order0 = {"b", "(fg*32 + 4*lid+0)", "z", "y", "(x+i)"}; - idx_order1 = {"b", "(fg*32 + 4*lid+1)", "z", "y", "(x+i)"}; - idx_order2 = {"b", "(fg*32 + 4*lid+2)", "z", "y", "(x+i)"}; - idx_order3 = {"b", "(fg*32 + 4*lid+3)", "z", "y", "(x+i)"}; + idx_order0 = {"b", "(fg*32 + OF_TO_DO*lid+0)", "z", "y", "(x+i)"}; + idx_order1 = {"b", "(fg*32 + OF_TO_DO*lid+1)", "z", "y", "(x+i)"}; + idx_order2 = {"b", "(fg*32 + OF_TO_DO*lid+2)", "z", "y", "(x+i)"}; + idx_order3 = {"b", "(fg*32 + OF_TO_DO*lid+3)", "z", "y", "(x+i)"}; } FusedOpsConfiguration conf0 = {"_0", idx_order0, "res0", input_dt, 1 }; FusedOpsConfiguration conf1 = {"_1", idx_order1, "res1", input_dt, 1 }; FusedOpsConfiguration conf2 = {"_2", idx_order2, "res2", input_dt, 1 }; FusedOpsConfiguration conf3 = {"_3", idx_order3, "res3", input_dt, 1 }; - jit.Merge(MakeFusedOpsJitConstants(params, {conf0, conf1, conf2, conf3})); + if (IsSIMDSizeSupported(params.engineInfo, 8)) { + jit.Merge(MakeFusedOpsJitConstants(params, {conf0, conf1, conf2, conf3})); + } else { + jit.Merge(MakeFusedOpsJitConstants(params, {conf0, conf1})); + } } return jit; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_mmad_b_fs_yx_fsv32.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_mmad_b_fs_yx_fsv32.h index 212298fc9f5410..51ff8efed783e5 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_mmad_b_fs_yx_fsv32.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_mmad_b_fs_yx_fsv32.h @@ -29,10 +29,18 @@ class ConvolutionKernel_mmad_b_fs_yx_fsv32 : public ConvolutionKernelBase { bool NeedPaddedInput() const override { return false; } WeightsLayout GetPreferredWeightsLayout(const convolution_params &p) const override { - if (DataTensor::ChannelsCount(p.outputs[0].GetLayout()) <= 4) { - return WeightsLayout::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4; + if (IsSIMDSizeSupported(p.engineInfo, 8)) { + if (DataTensor::ChannelsCount(p.outputs[0].GetLayout()) <= 4) { + return WeightsLayout::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4; + } else { + return WeightsLayout::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4; + } } else { - return WeightsLayout::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4; + if (DataTensor::ChannelsCount(p.outputs[0].GetLayout()) <= 4) { + return WeightsLayout::os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2; + } else { + return WeightsLayout::os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2; + } } } std::vector GetSupportedFusedOps() const override { diff --git a/src/plugins/intel_gpu/src/kernel_selector/tensor_type.cpp b/src/plugins/intel_gpu/src/kernel_selector/tensor_type.cpp index aa2568c0367c95..3b247ddd9c8421 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/tensor_type.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/tensor_type.cpp @@ -114,6 +114,8 @@ WeightsTensor::WeightsChannelArray WeightsTensor::weightsChannelArray {{ { WeightsLayout::os_is_zyx_isa8_osv16_isv4, { 0, 1, 2, 3, 4, -1 } }, { WeightsLayout::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4, { 0, 1, -1, 2, 3, -1 } }, { WeightsLayout::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4, { 0, 1, 2, 3, 4, -1 } }, + { WeightsLayout::os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2, { 0, 1, -1, 2, 3, -1 } }, + { WeightsLayout::os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2, { 0, 1, 2, 3, 4, -1 } }, { WeightsLayout::os_is_yx_osv8_isv4, { 0, 1, -1, 2, 3, -1 } }, { WeightsLayout::os_is_yx_osv16_isv4, { 0, 1, -1, 2, 3, -1 } }, { WeightsLayout::os_is_yx_osv32_isv4_swizzled_by_2, { 0, 1, -1, 2, 3, -1 } }, @@ -629,11 +631,13 @@ NDims WeightsTensor::GetSimpleDims(const std::vector& d, WeightsLayout l newDims[4] = RoundUp(newDims[4], 16); break; case os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4: + case os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2: assert(newDims.size() == 4); newDims[3] = RoundUp(newDims[3], 32); newDims[2] = RoundUp(newDims[2], 32); break; case os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4: + case os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2: assert(newDims.size() == 5); newDims[4] = RoundUp(newDims[4], 32); newDims[3] = RoundUp(newDims[3], 32); diff --git a/src/plugins/intel_gpu/src/kernel_selector/tensor_type.h b/src/plugins/intel_gpu/src/kernel_selector/tensor_type.h index e272cad2eaf654..3a080fe3606b33 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/tensor_type.h +++ b/src/plugins/intel_gpu/src/kernel_selector/tensor_type.h @@ -140,6 +140,8 @@ enum WeightsLayout { // 1,5... os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4, // for MMAD convolution swizzled from ofm 0..7 to 0,4,8,12,16,20,24,28, // 1,5... + os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2, + os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2, os_is_yx_osv16_isv4, // swizzled weights for convolution using IMAD os_is_yx_osv8_isv4, // weights for int8 blocked conv os_is_yx_osv32_isv4_swizzled_by_2, // weights for bfyx -> b_fs_yx_fsv32 convolution using IMAD with swizzled ofm (0, 2, 4..), (1, 3, 5...) diff --git a/src/plugins/intel_gpu/src/runtime/format.cpp b/src/plugins/intel_gpu/src/runtime/format.cpp index 7601897e1e1a7d..95a03f7e060554 100644 --- a/src/plugins/intel_gpu/src/runtime/format.cpp +++ b/src/plugins/intel_gpu/src/runtime/format.cpp @@ -107,6 +107,8 @@ static const std::map format_traits_map { FMT_TRAITS(os_is_zyx_isa8_osv16_isv4, 1, 1, 3, 0, {0, 1, 2, 3, 4}, "oizyx", "oixyz", {{1, 8}, {0, 16}, {1, 4}}, {{1, 8}, {0, 16}, {1, 4}}), // NOLINT FMT_TRAITS(os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4, 1, 1, 2, 0, {0, 1, 2, 3}, "oiyx", "oixy", {{0, 32}, {1, 32}}, {{0, 32}, {1, 32}}), // NOLINT FMT_TRAITS(os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4, 1, 1, 3, 0, {0, 1, 2, 3, 4}, "oizyx", "oixyz", {{0, 32}, {1, 32}}, {{0, 32}, {1, 32}}), // NOLINT + FMT_TRAITS(os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2, 1, 1, 2, 0, {0, 1, 2, 3}, "oiyx", "oixy", {{0, 32}, {1, 32}}, {{0, 32}, {1, 32}}), // NOLINT + FMT_TRAITS(os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2, 1, 1, 3, 0, {0, 1, 2, 3, 4}, "oizyx", "oixyz", {{0, 32}, {1, 32}}, {{0, 32}, {1, 32}}), // NOLINT FMT_TRAITS(os_is_yx_osv16_isv4, 1, 1, 2, 0, {0, 1, 2, 3}, "oiyx", "oixy", {{0, 16}, {1, 4}}, {{0, 16}, {1, 4}}), // NOLINT FMT_TRAITS(os_is_yx_osv8_isv4, 1, 1, 2, 0, {0, 1, 2, 3}, "oiyx", "oixy", {{0, 8}, {1, 4}}, {{0, 8}, {1, 4}}), // NOLINT FMT_TRAITS(os_is_zyx_osv16_isv16, 1, 1, 3, 0, {0, 1, 2, 3, 4}, "oizyx", "oixyz", {{0, 16}, {1, 16}}, {{0, 16}, {1, 16}}), // NOLINT diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp index 04c0cf7b79fc0a..8c35c8774250a7 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp @@ -9110,6 +9110,7 @@ class convolution_scale_random_test : public convolution_random_test_base {}; +class convolution_random_fsv32_test : public testing::TestWithParam {}; using convolution_random_test_s8s8f32 = convolution_random_test_base; using convolution_random_test_u8s8f32 = convolution_random_test_base; @@ -9238,6 +9239,32 @@ struct params_generator : std::vector { return *this; } + params_generator& forced_mmad_b_fs_yx_fsv32_test_params(format::type input_format, + bool asymm_weights = false, + bool asymm_data = false, + bool padded_input = false, + bool bigger_pad = false) { + std::vector batches = { 1, 2 }; + for (auto b : batches) { + // 3x3 + push_back(convolution_random_test_all_params{ + b, 32, 48, { 14, 14 }, { 3, 3 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input, bigger_pad, false }); + push_back(convolution_random_test_all_params{ + b, 32, 48, { 14, 14 }, { 3, 3 }, { 2, 2 }, { 1, 1 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input, bigger_pad, false }); + // 1x1 + push_back(convolution_random_test_all_params{ + b, 32, 48, { 28, 28 }, { 1, 1 }, { 1, 1 }, { 0, 0 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input, bigger_pad, false }); + push_back(convolution_random_test_all_params{ + b, 32, 48, { 28, 28 }, { 1, 1 }, { 2, 2 }, { 0, 0 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input, bigger_pad, false }); + // 5x5 + push_back(convolution_random_test_all_params{ + b, 32, 48, { 28, 28 }, { 5, 5 }, { 1, 1 }, { 2, 2 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input, bigger_pad, false }); + push_back(convolution_random_test_all_params{ + b, 32, 48, { 28, 28 }, { 5, 5 }, { 2, 2 }, { 2, 2 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input, bigger_pad, false }); + } + return *this; + } + params_generator& all_test_params(format::type input_format, bool asymm_weights = false, bool asymm_data = false, @@ -9305,6 +9332,28 @@ INSTANTIATE_TEST_SUITE_P( to_string_convolution_all_params ); +TEST_P(convolution_random_fsv32_test, u8u8f32_forced_mmad_b_fs_yx_fsv32) { + convolution_random_test_u8u8f32 test; + const auto p = GetParam(); + ov::intel_gpu::ImplForcingMap force_map{ { "conv", { p.input_format, "convolution_gpu_mmad_b_fs_yx_fsv32" } } }; + test.set_forced_impl(force_map); + ASSERT_NO_FATAL_FAILURE(test.run_random(p)); +} + +INSTANTIATE_TEST_SUITE_P( + basic, + convolution_random_fsv32_test, + testing::ValuesIn( + params_generator() + .forced_mmad_b_fs_yx_fsv32_test_params(format::b_fs_yx_fsv32) + .forced_mmad_b_fs_yx_fsv32_test_params(format::b_fs_yx_fsv32, true, true) + .forced_mmad_b_fs_yx_fsv32_test_params(format::b_fs_yx_fsv32, false, true) + .forced_mmad_b_fs_yx_fsv32_test_params(format::b_fs_yx_fsv32, true, false) + .forced_mmad_b_fs_yx_fsv32_test_params(format::b_fs_yx_fsv32, false, false, true) + ), + to_string_convolution_all_params +); + class convolution_random_all_test : public testing::TestWithParam {}; TEST_P(convolution_random_all_test, u8s8f32) { @@ -11741,6 +11790,7 @@ struct conv_dyn_3d_test_params { }; class conv_dyn_3d_test : public testing::TestWithParam {}; +class conv_3d_test_mmad : public testing::TestWithParam {}; TEST_P(conv_dyn_3d_test, convolution_gpu_b_fs_zyx_fsv16_imad_quantized) { auto& engine = get_test_engine(); @@ -11866,6 +11916,11 @@ INSTANTIATE_TEST_SUITE_P(smoke, conv_dyn_3d_test, { ov::Shape{1, 16, 5, 5, 5}, ov::Shape{16, 1, 1, 3, 3, 3}, ov::Strides{1, 1, 1}, ov::Strides{1, 1, 1}, ov::CoordinateDiff{0, 0, 0}, ov::CoordinateDiff{0, 0, 0}, 16, false } })); +INSTANTIATE_TEST_SUITE_P(smoke, conv_3d_test_mmad, + testing::ValuesIn(std::vector{ + { ov::Shape{32, 32, 5, 5, 5}, ov::Shape{32, 32, 3, 3, 3}, ov::Strides{1, 1, 1}, ov::Strides{1, 1, 1}, ov::CoordinateDiff{0, 0, 0}, ov::CoordinateDiff{0, 0, 0}, 1, false}, +})); + TEST(group_convolution_f16_fw_gpu, basic_1d_group_convolution) { auto& engine = get_test_engine(); if (!engine.get_device_info().supports_immad) @@ -12159,3 +12214,86 @@ TEST(conv_dyn_test, changed_batch_convolution_test_reorder_cache_mismatch) { } } } + +TEST_P(conv_3d_test_mmad, convolution_gpu_b_fs_zyx_mmad) { + auto& engine = get_test_engine(); + auto p = GetParam(); + + auto calculate_ref = [&](memory::ptr input, memory::ptr weights, + memory::ptr a_zp, memory::ptr compensation, ExecutionConfig config) { + auto in_layout = input->get_layout(); + + topology topology_ref( + input_layout("input", in_layout), + data("weights", weights), + data("a_zp", a_zp), + data("compensation", compensation), + convolution("conv", input_info("input"), "weights", no_bias, "", "a_zp", "compensation", + p.groups, p.stride, p.dilation, p.pad_begin, p.pad_end, false, data_types::f32)); + + network network_ref(engine, topology_ref, config); + network_ref.set_input_data("input", input); + + auto outputs_ref = network_ref.execute(); + + return outputs_ref.at("conv").get_memory(); + }; + + + + auto in_layout = layout{p.in_shape, data_types::u8, format::b_fs_zyx_fsv32}; + auto input = engine.allocate_memory({ p.in_shape, data_types::u8, format::b_fs_zyx_fsv32 }); + auto weights = engine.allocate_memory({p.wei_shape, data_types::i8, format::bfzyx}); + + auto a_zp_shape = ov::Shape(p.in_shape.size(), 1); + a_zp_shape[1] = p.in_shape[1]; + auto a_zp = engine.allocate_memory({ a_zp_shape, data_types::u8, format::bfyx }); + + auto compensation = engine.allocate_memory({ a_zp_shape, data_types::f32, format::bfyx }); + + tests::random_generator rg(GET_SUITE_NAME); + VF input_rnd = rg.generate_random_1d(ov::shape_size(p.in_shape), 0, 10); + VF weights_rnd = rg.generate_random_1d(ov::shape_size(p.wei_shape), -5, 5); + VF a_zp_rnd = rg.generate_random_1d(ov::shape_size(a_zp_shape), 1, 5); + VF compensation_rnd = rg.generate_random_1d(ov::shape_size(a_zp_shape), -5, 5); + + set_values(input, input_rnd); + set_values(weights, weights_rnd); + set_values(a_zp, a_zp_rnd); + set_values(compensation, compensation_rnd); + + topology topology( + input_layout("input", in_layout), + data("weights", weights), + data("a_zp", a_zp), + data("compensation", compensation), + convolution("conv", input_info("input"), "weights", no_bias, "", "a_zp", "compensation", + p.groups, p.stride, p.dilation, p.pad_begin, p.pad_end, false, data_types::f32)); + + ExecutionConfig config = get_test_default_config(engine); + ov::intel_gpu::ImplementationDesc conv_impl = { format::b_fs_zyx_fsv32, "convolution_gpu_mmad_b_fs_yx_fsv32", impl_types::ocl }; + config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ { "conv", conv_impl } })); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::enable_profiling(true)); + + cldnn::network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), p.is_caching_test); + network->set_input_data("input", input); + + auto inst = network->get_primitive("conv"); + auto impl = inst->get_impl(); + ASSERT_TRUE(impl != nullptr); + + auto outputs = network->execute(); + + auto output_memory = outputs.at("conv").get_memory(); + + auto output_memory_ref = calculate_ref(input, weights, a_zp, compensation, config); + + cldnn::mem_lock output_ptr(output_memory, get_test_stream()); + cldnn::mem_lock output_ptr_ref(output_memory_ref, get_test_stream()); + + ASSERT_EQ(outputs.at("conv").get_layout(), output_memory_ref->get_layout()); + for (size_t i = 0; i < output_ptr.size(); i++) { + ASSERT_EQ(output_ptr[i], output_ptr_ref[i]); + } +}