diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/format.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/format.hpp
index 04771fbe8ca186..aa60d61f133596 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/format.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/format.hpp
@@ -200,6 +200,8 @@ struct format {
         os_is_zyx_isa8_osv16_isv4,                    ///< format for weights for fully connected MMAD
         os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4,   ///< format for weights for MMAD fsv32 convolution
         os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4,  ///< format for weights for MMAD fsv32 convolution
+        os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2,   ///< format for weights for MMAD fsv32 convolution
+        os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2,   ///< format for weights for MMAD fsv32 convolution
         os_is_zyx_osa4_isa8_osv8_isv4,                ///< format for weights for MMAD fsv32 convolution
         os_is_yx_osa4_isa8_osv8_isv4,                 ///< format for weights for MMAD fsv32 convolution
         os_is_yx_osv16_isv4,                          ///< format for weights for IMAD convolutions
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp
index 9833b750caf06c..2adf2c67d3f5ef 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp
@@ -538,6 +538,10 @@ kernel_selector::weights_layout to_weights_layout(format f, bool is_grouped) {
             return kernel_selector::weights_layout::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4;
         case format::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4:
             return kernel_selector::weights_layout::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4;
+        case format::os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2:
+            return kernel_selector::weights_layout::os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2;
+        case format::os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2:
+            return kernel_selector::weights_layout::os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2;
         case format::os_is_yx_osv16_isv4:
             return kernel_selector::weights_layout::os_is_yx_osv16_isv4;
         case format::os_is_yx_osv32_isv4_swizzled_by_2:
@@ -728,6 +732,10 @@ cldnn::format::type from_weights_layout(kernel_selector::weights_layout l) {
             return cldnn::format::os_is_zyx_isa8_osv16_isv4;
         case kernel_selector::weights_layout::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4:
             return cldnn::format::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4;
+        case kernel_selector::weights_layout::os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2:
+            return cldnn::format::os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2;
+        case kernel_selector::weights_layout::os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2:
+            return cldnn::format::os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2;
         case kernel_selector::weights_layout::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4:
             return cldnn::format::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4;
         case kernel_selector::weights_layout::os_is_yx_osv32_isv4_swizzled_by_2:
diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_mmad_b_fs_yx_fsv32.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_mmad_b_fs_yx_fsv32.cl
index 0620115cfab300..af40bf5f68eb60 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_mmad_b_fs_yx_fsv32.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_mmad_b_fs_yx_fsv32.cl
@@ -27,7 +27,9 @@
     #define ACTIVATION_TYPE_VEC float8
     #define TO_ACTIVATION_TYPE_VEC(x) convert_float8(x)
     #define MMAD MMAD_8x8
+#if SUB_GROUP_SIZE == 8    
     #define BLOCK_WRITE(ptr, val) _sub_group_block_write8((__global uint*)(ptr), as_uint8(val));
+#endif
 #elif OUTPUT_X_BLOCK_SIZE == 4
     #define PACKED_TYPE_VEC MAKE_VECTOR_TYPE(PACKED_IN_TYPE, 4)
     #define ACCUMULATOR_TYPE_VEC int4
@@ -35,7 +37,9 @@
     #define ACTIVATION_TYPE_VEC float4
     #define TO_ACTIVATION_TYPE_VEC(x) convert_float4(x)
     #define MMAD MMAD_4x8
+#if SUB_GROUP_SIZE == 8    
     #define BLOCK_WRITE(ptr, val) _sub_group_block_write4((__global uint*)(ptr), as_uint4(val));
+#endif
 #else
 #error "convolution_gpu_mmad_b_fs_yx_fsv32: Unsupported block size"
 #endif
@@ -50,7 +54,8 @@
     #error "convolution_gpu_mmad_b_fs_yx_fsv32: Unsupported FILTER_TYPE"
 #endif
 
-__attribute__((reqd_work_group_size(8, OW_GROUP, 1)))
+
+__attribute__((reqd_work_group_size(SUB_GROUP_SIZE, OW_GROUP, 1)))
 REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
 KERNEL(convolution_mmad_b_fs_yx_fsv32)(
     __global INPUT0_TYPE* input,
@@ -89,13 +94,16 @@ KERNEL(convolution_mmad_b_fs_yx_fsv32)(
     const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
     const int input_z = z * STRIDE_SIZE_Z - PADDING_SIZE_Z;
 
-    ACCUMULATOR_TYPE_VEC acc[4] = { 0 }; // 4*8 packed channels * OUTPUT_X_BLOCK_SIZE
+    ACCUMULATOR_TYPE_VEC acc[OF_TO_DO] = { 0 }; // OF_TO_DO*8 packed channels * OUTPUT_X_BLOCK_SIZE
 #if ASYMMETRIC_WEIGHTS_QUANTIZATION
     ACCUMULATOR_TYPE_VEC acc_assym_weights = 0;
 #endif
 
+#if INPUT0_DIMS == 5
+    const uint input_offset = INPUT0_GET_INDEX(b,0,0,0,0);
+#else //INPUT0_DIMS == 5
     const uint input_offset = INPUT0_GET_INDEX(b,0,0,0);
-
+#endif //INPUT0_DIMS == 5
     uint filter_idx = fg * FILTER_SIZE_X * FILTER_SIZE_Y * FILTER_SIZE_Z * ISV_SIZE * OSV_SIZE * IFM_BLOCKS;
 
     const int input_x_pitch = ISV_SIZE;
@@ -106,12 +114,13 @@ KERNEL(convolution_mmad_b_fs_yx_fsv32)(
 
     for (int icb = 0; icb < IFM_BLOCKS; ++icb) {
 #if ASYMMETRIC_WEIGHTS_QUANTIZATION
-        uchar4 m;
-        __attribute__((opencl_unroll_hint(4)))
-        for (int i = 0; i < 4; i++) {
-            m[i] = icb*32 + lid*4 + i < INPUT0_FEATURE_NUM;
+        uchar4 m = { 0, 0, 0, 0 };
+        __attribute__((opencl_unroll_hint(OF_TO_DO)))
+        for (int i = 0; i < OF_TO_DO; i++) {
+            m[i] = icb*32 + lid*OF_TO_DO + i < INPUT0_FEATURE_NUM;
         }
         int mm = as_int(m);
+#if SUB_GROUP_SIZE == 8 
         int8 multiplier = (int8)(sub_group_broadcast(mm, 0),
                                  sub_group_broadcast(mm, 1),
                                  sub_group_broadcast(mm, 2),
@@ -120,6 +129,16 @@ KERNEL(convolution_mmad_b_fs_yx_fsv32)(
                                  sub_group_broadcast(mm, 5),
                                  sub_group_broadcast(mm, 6),
                                  sub_group_broadcast(mm, 7));
+#else //SUB_GROUP_SIZE == 8 
+        int8 multiplier = (int8)(sub_group_broadcast(mm, 0) + (sub_group_broadcast(mm, 1) << 16),
+                                 sub_group_broadcast(mm, 2) + (sub_group_broadcast(mm, 3) << 16),
+                                 sub_group_broadcast(mm, 4) + (sub_group_broadcast(mm, 5) << 16),
+                                 sub_group_broadcast(mm, 6) + (sub_group_broadcast(mm, 7) << 16),
+                                 sub_group_broadcast(mm, 8) + (sub_group_broadcast(mm, 9) << 16),
+                                 sub_group_broadcast(mm, 10) + (sub_group_broadcast(mm, 11) << 16),
+                                 sub_group_broadcast(mm, 12) + (sub_group_broadcast(mm, 13) << 16),
+                                 sub_group_broadcast(mm, 14) + (sub_group_broadcast(mm, 15) << 16));
+#endif
 #endif
 
         __attribute__((opencl_unroll_hint(FILTER_SIZE_Z)))
@@ -142,23 +161,24 @@ KERNEL(convolution_mmad_b_fs_yx_fsv32)(
 
                     int xb = 0;
                     for (; xb < INPUT_LINE_SIZE; xb++) {
-
-                        bool x_cross_fm = input_x + xb < 0 || input_x + xb >= INPUT0_SIZE_X;
-                        if (y_cross_fm || x_cross_fm || z_cross_fm) {
+                        if (lid < 8){
+                            bool x_cross_fm = input_x + xb < 0 || input_x + xb >= INPUT0_SIZE_X;
+                            if (y_cross_fm || x_cross_fm || z_cross_fm) {
 #if ASYMMETRIC_DATA_QUANTIZATION
-                            const int azp_idx = (icb*ISV_SIZE + 4*lid) % ACTIVATIONS_ZERO_POINTS_FEATURE_NUM;
-                            line_cache[xb] = AS_TYPE(PACKED_IN_TYPE, ((const __global uint*)(activations_zp + azp_idx))[0]);
+                                const int azp_idx = (icb*ISV_SIZE + 4*lid) % ACTIVATIONS_ZERO_POINTS_FEATURE_NUM;
+                                line_cache[xb] = AS_TYPE(PACKED_IN_TYPE, ((const __global uint*)(activations_zp + azp_idx))[0]);
 #else
-                            line_cache[xb] = 0;
+                                line_cache[xb] = 0;
 #endif
-                        }
-                        else
-                        {
-                            line_cache[xb] = AS_TYPE(PACKED_IN_TYPE, _sub_group_block_read((const __global uint*)(input + in_addr +
-                                                                          icb * input_fs_pitch +
-                                                                          kd * DILATION_SIZE_Z * input_z_pitch +
-                                                                          kh * DILATION_SIZE_Y * input_y_pitch +
-                                                                          xb * input_x_pitch)));
+                            }
+                            else
+                            {
+                                line_cache[xb] = AS_TYPE(PACKED_IN_TYPE, _sub_group_block_read((const __global uint*)(input + in_addr +
+                                                                            icb * input_fs_pitch +
+                                                                            kd * DILATION_SIZE_Z * input_z_pitch +
+                                                                            kh * DILATION_SIZE_Y * input_y_pitch +
+                                                                            xb * input_x_pitch)));
+                            }
                         }
                     }
                 }
@@ -175,16 +195,23 @@ KERNEL(convolution_mmad_b_fs_yx_fsv32)(
                                      + kh * ISV_SIZE * OSV_SIZE * FILTER_SIZE_X
                                      + kw * ISV_SIZE * OSV_SIZE;
 
+                    
+#if SUB_GROUP_SIZE == 8 
                     MAKE_VECTOR_TYPE(PACKED_WEIGHTS_TYPE, 8) weights_data0 = AS_PACKED_WEIGHTS_TYPE_VEC8(_sub_group_block_read8((const __global uint*)(weights + f_off + 0*8*ISV_SIZE)));
                     MAKE_VECTOR_TYPE(PACKED_WEIGHTS_TYPE, 8) weights_data1 = AS_PACKED_WEIGHTS_TYPE_VEC8(_sub_group_block_read8((const __global uint*)(weights + f_off + 1*8*ISV_SIZE)));
                     MAKE_VECTOR_TYPE(PACKED_WEIGHTS_TYPE, 8) weights_data2 = AS_PACKED_WEIGHTS_TYPE_VEC8(_sub_group_block_read8((const __global uint*)(weights + f_off + 2*8*ISV_SIZE)));
                     MAKE_VECTOR_TYPE(PACKED_WEIGHTS_TYPE, 8) weights_data3 = AS_PACKED_WEIGHTS_TYPE_VEC8(_sub_group_block_read8((const __global uint*)(weights + f_off + 3*8*ISV_SIZE)));
+#else //SUB_GROUP_SIZE == 8 
+                    MAKE_VECTOR_TYPE(PACKED_WEIGHTS_TYPE, 8) weights_data0 = AS_PACKED_WEIGHTS_TYPE_VEC8(_sub_group_block_read8((const __global uint*)(weights + f_off + 0*16*ISV_SIZE)));
+                    MAKE_VECTOR_TYPE(PACKED_WEIGHTS_TYPE, 8) weights_data1 = AS_PACKED_WEIGHTS_TYPE_VEC8(_sub_group_block_read8((const __global uint*)(weights + f_off + 1*16*ISV_SIZE)));
+#endif
 
                     acc[0] = MMAD(src, weights_data0, acc[0]); // 8 elements in 4*lid+0 out channel
                     acc[1] = MMAD(src, weights_data1, acc[1]); // 8 elements in 4*lid+1 out channel
+#if SUB_GROUP_SIZE == 8 
                     acc[2] = MMAD(src, weights_data2, acc[2]); // 8 elements in 4*lid+2 out channel
                     acc[3] = MMAD(src, weights_data3, acc[3]); // 8 elements in 4*lid+3 out channel
-
+#endif
 #if ASYMMETRIC_WEIGHTS_QUANTIZATION
                     acc_assym_weights = MMAD(src, multiplier, acc_assym_weights);
 #endif
@@ -198,62 +225,76 @@ KERNEL(convolution_mmad_b_fs_yx_fsv32)(
 #endif
 
 #if OUTPUT_IS_FP
-    MAKE_VECTOR_TYPE(OUTPUT_TYPE, OUTPUT_X_BLOCK_SIZE) dst[4];
+    MAKE_VECTOR_TYPE(OUTPUT_TYPE, OUTPUT_X_BLOCK_SIZE) dst[OF_TO_DO];
 
     for (int i = 0; i < OUTPUT_X_BLOCK_SIZE; i++) {
 #if BIAS_TERM
-        ACTIVATION_TYPE res0 = TO_ACTIVATION_TYPE(acc[0][i]) + (ACTIVATION_TYPE)(biases[bias_index + 4*lid+0]);
-        ACTIVATION_TYPE res1 = TO_ACTIVATION_TYPE(acc[1][i]) + (ACTIVATION_TYPE)(biases[bias_index + 4*lid+1]);
-        ACTIVATION_TYPE res2 = TO_ACTIVATION_TYPE(acc[2][i]) + (ACTIVATION_TYPE)(biases[bias_index + 4*lid+2]);
-        ACTIVATION_TYPE res3 = TO_ACTIVATION_TYPE(acc[3][i]) + (ACTIVATION_TYPE)(biases[bias_index + 4*lid+3]);
+        ACTIVATION_TYPE res0 = TO_ACTIVATION_TYPE(acc[0][i]) + (ACTIVATION_TYPE)(biases[bias_index + OF_TO_DO*lid+0]);
+        ACTIVATION_TYPE res1 = TO_ACTIVATION_TYPE(acc[1][i]) + (ACTIVATION_TYPE)(biases[bias_index + OF_TO_DO*lid+1]); 
+#if SUB_GROUP_SIZE == 8
+        ACTIVATION_TYPE res2 = TO_ACTIVATION_TYPE(acc[2][i]) + (ACTIVATION_TYPE)(biases[bias_index + OF_TO_DO*lid+2]);
+        ACTIVATION_TYPE res3 = TO_ACTIVATION_TYPE(acc[3][i]) + (ACTIVATION_TYPE)(biases[bias_index + OF_TO_DO*lid+3]);
+#endif
 #else
         ACTIVATION_TYPE res0 = TO_ACTIVATION_TYPE(acc[0][i]);
         ACTIVATION_TYPE res1 = TO_ACTIVATION_TYPE(acc[1][i]);
+#if SUB_GROUP_SIZE == 8
         ACTIVATION_TYPE res2 = TO_ACTIVATION_TYPE(acc[2][i]);
         ACTIVATION_TYPE res3 = TO_ACTIVATION_TYPE(acc[3][i]);
 #endif
+#endif
 
 #if ASYMMETRIC_WEIGHTS_QUANTIZATION
-        const uint idx0 = fg*OSV_SIZE + 4*lid + 0;
-        const uint idx1 = fg*OSV_SIZE + 4*lid + 1;
-        const uint idx2 = fg*OSV_SIZE + 4*lid + 2;
-        const uint idx3 = fg*OSV_SIZE + 4*lid + 3;
 
+        const uint idx0 = fg*OSV_SIZE + OF_TO_DO*lid + 0;
+        const uint idx1 = fg*OSV_SIZE + OF_TO_DO*lid + 1;
+#if SUB_GROUP_SIZE == 8
+        const uint idx2 = fg*OSV_SIZE + OF_TO_DO*lid + 2;
+        const uint idx3 = fg*OSV_SIZE + OF_TO_DO*lid + 3;
+#endif
         res0 -= acc_assym_weights[i] * TO_ACCUMULATOR_TYPE(weights_zp[idx0]);
         res1 -= acc_assym_weights[i] * TO_ACCUMULATOR_TYPE(weights_zp[idx1]);
+#if SUB_GROUP_SIZE == 8
         res2 -= acc_assym_weights[i] * TO_ACCUMULATOR_TYPE(weights_zp[idx2]);
         res3 -= acc_assym_weights[i] * TO_ACCUMULATOR_TYPE(weights_zp[idx3]);
+#endif
 #endif  // ASYMMETRIC_WEIGHTS_QUANTIZATION
 
 #if ASYMMETRIC_DATA_QUANTIZATION
-        res0 += compensation[fg*OSV_SIZE + 4*lid + 0];
-        res1 += compensation[fg*OSV_SIZE + 4*lid + 1];
-        res2 += compensation[fg*OSV_SIZE + 4*lid + 2];
-        res3 += compensation[fg*OSV_SIZE + 4*lid + 3];
+        res0 += compensation[fg*OSV_SIZE + OF_TO_DO*lid + 0];
+        res1 += compensation[fg*OSV_SIZE + OF_TO_DO*lid + 1];
+#if SUB_GROUP_SIZE == 8
+        res2 += compensation[fg*OSV_SIZE + OF_TO_DO*lid + 2];
+        res3 += compensation[fg*OSV_SIZE + OF_TO_DO*lid + 3];
+#endif
 #endif  // ASYMMETRIC_DATA_QUANTIZATION
 
 #if HAS_FUSED_OPS
         { FUSED_OPS_0; dst[0][i] = FUSED_OPS_RESULT_0; };
         { FUSED_OPS_1; dst[1][i] = FUSED_OPS_RESULT_1; };
+#if SUB_GROUP_SIZE == 8
         { FUSED_OPS_2; dst[2][i] = FUSED_OPS_RESULT_2; };
         { FUSED_OPS_3; dst[3][i] = FUSED_OPS_RESULT_3; };
+#endif
 #else
         dst[0][i] = TO_OUTPUT_TYPE(res0);
         dst[1][i] = TO_OUTPUT_TYPE(res1);
+#if SUB_GROUP_SIZE == 8
         dst[2][i] = TO_OUTPUT_TYPE(res2);
         dst[3][i] = TO_OUTPUT_TYPE(res3);
+#endif
 #endif
     }
 
     for (int i = 0; i < OUTPUT_X_BLOCK_SIZE; i++) {
-        for (int ofm = 0; ofm < 4; ofm++) {
+        for (int ofm = 0; ofm < OF_TO_DO; ofm++) {
 #if OUTPUT_DIMS == 5
-            const uint dst_index = OUTPUT_GET_INDEX(b, fg*OSV_SIZE + ofm + 4*lid, z, y, x+i);
+            const uint dst_index = OUTPUT_GET_INDEX(b, fg*OSV_SIZE + ofm + OF_TO_DO*lid, z, y, x+i);
 #elif OUTPUT_DIMS <= 4
-            const uint dst_index = OUTPUT_GET_INDEX(b, fg*OSV_SIZE + ofm + 4*lid, y, x+i);
+            const uint dst_index = OUTPUT_GET_INDEX(b, fg*OSV_SIZE + ofm + OF_TO_DO*lid, y, x+i);
 #endif
             bool full_x = OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE == 0 || x + i < OUTPUT_SIZE_X;
-            bool full_f = OUTPUT_FEATURE_NUM % OSV_SIZE == 0 || fg * OSV_SIZE + 4 * lid + ofm < OUTPUT_FEATURE_NUM;
+            bool full_f = OUTPUT_FEATURE_NUM % OSV_SIZE == 0 || fg * OSV_SIZE + OF_TO_DO * lid + ofm < OUTPUT_FEATURE_NUM;
             if (full_x && full_f) {
                 output[dst_index] = dst[ofm][i];
             }
@@ -264,92 +305,113 @@ KERNEL(convolution_mmad_b_fs_yx_fsv32)(
 
     for (int i = 0; i < OUTPUT_X_BLOCK_SIZE; i++) {
 #if BIAS_TERM
-        ACTIVATION_TYPE res0 = TO_ACTIVATION_TYPE(acc[0][i]) + (ACTIVATION_TYPE)(biases[bias_index + 4*lid+0]);
-        ACTIVATION_TYPE res1 = TO_ACTIVATION_TYPE(acc[1][i]) + (ACTIVATION_TYPE)(biases[bias_index + 4*lid+1]);
-        ACTIVATION_TYPE res2 = TO_ACTIVATION_TYPE(acc[2][i]) + (ACTIVATION_TYPE)(biases[bias_index + 4*lid+2]);
-        ACTIVATION_TYPE res3 = TO_ACTIVATION_TYPE(acc[3][i]) + (ACTIVATION_TYPE)(biases[bias_index + 4*lid+3]);
+        ACTIVATION_TYPE res0 = TO_ACTIVATION_TYPE(acc[0][i]) + (ACTIVATION_TYPE)(biases[bias_index + OF_TO_DO*lid+0]);
+        ACTIVATION_TYPE res1 = TO_ACTIVATION_TYPE(acc[1][i]) + (ACTIVATION_TYPE)(biases[bias_index + OF_TO_DO*lid+1]);
+#if SUB_GROUP_SIZE == 8        
+        ACTIVATION_TYPE res2 = TO_ACTIVATION_TYPE(acc[2][i]) + (ACTIVATION_TYPE)(biases[bias_index + OF_TO_DO*lid+2]);
+        ACTIVATION_TYPE res3 = TO_ACTIVATION_TYPE(acc[3][i]) + (ACTIVATION_TYPE)(biases[bias_index + OF_TO_DO*lid+3]);
+#endif
 #else
         ACTIVATION_TYPE res0 = TO_ACTIVATION_TYPE(acc[0][i]);
         ACTIVATION_TYPE res1 = TO_ACTIVATION_TYPE(acc[1][i]);
+#if SUB_GROUP_SIZE == 8
         ACTIVATION_TYPE res2 = TO_ACTIVATION_TYPE(acc[2][i]);
         ACTIVATION_TYPE res3 = TO_ACTIVATION_TYPE(acc[3][i]);
 #endif
+#endif
 
 #if ASYMMETRIC_WEIGHTS_QUANTIZATION
-        const uint idx0 = fg*OSV_SIZE + 4*lid + 0;
-        const uint idx1 = fg*OSV_SIZE + 4*lid + 1;
-        const uint idx2 = fg*OSV_SIZE + 4*lid + 2;
-        const uint idx3 = fg*OSV_SIZE + 4*lid + 3;
-
+        const uint idx0 = fg*OSV_SIZE + OF_TO_DO*lid + 0;
+        const uint idx1 = fg*OSV_SIZE + OF_TO_DO*lid + 1;
+#if SUB_GROUP_SIZE == 8
+        const uint idx2 = fg*OSV_SIZE + OF_TO_DO*lid + 2;
+        const uint idx3 = fg*OSV_SIZE + OF_TO_DO*lid + 3;
+#endif
         res0 -= acc_assym_weights[i] * TO_ACCUMULATOR_TYPE(weights_zp[idx0]);
         res1 -= acc_assym_weights[i] * TO_ACCUMULATOR_TYPE(weights_zp[idx1]);
+#if SUB_GROUP_SIZE == 8
         res2 -= acc_assym_weights[i] * TO_ACCUMULATOR_TYPE(weights_zp[idx2]);
         res3 -= acc_assym_weights[i] * TO_ACCUMULATOR_TYPE(weights_zp[idx3]);
+#endif
 
 #endif  // ASYMMETRIC_WEIGHTS_QUANTIZATION
 
 #if ASYMMETRIC_DATA_QUANTIZATION
-        res0 += compensation[fg*OSV_SIZE + 4*lid + 0];
-        res1 += compensation[fg*OSV_SIZE + 4*lid + 1];
-        res2 += compensation[fg*OSV_SIZE + 4*lid + 2];
-        res3 += compensation[fg*OSV_SIZE + 4*lid + 3];
+        res0 += compensation[fg*OSV_SIZE + OF_TO_DO*lid + 0];
+        res1 += compensation[fg*OSV_SIZE + OF_TO_DO*lid + 1];
+#if SUB_GROUP_SIZE == 8
+        res2 += compensation[fg*OSV_SIZE + OF_TO_DO*lid + 2];
+        res3 += compensation[fg*OSV_SIZE + OF_TO_DO*lid + 3];
+#endif
 #endif  // ASYMMETRIC_DATA_QUANTIZATION
 
-        MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4) pack;
+        MAKE_VECTOR_TYPE(OUTPUT_TYPE, OF_TO_DO) pack;
 #if HAS_FUSED_OPS
         { FUSED_OPS_0; pack[0] = FUSED_OPS_RESULT_0; };
         { FUSED_OPS_1; pack[1] = FUSED_OPS_RESULT_1; };
+#if SUB_GROUP_SIZE == 8
         { FUSED_OPS_2; pack[2] = FUSED_OPS_RESULT_2; };
         { FUSED_OPS_3; pack[3] = FUSED_OPS_RESULT_3; };
+#endif
 #else
         pack[0] = TO_OUTPUT_TYPE(res0);
         pack[1] = TO_OUTPUT_TYPE(res1);
+#if SUB_GROUP_SIZE == 8
         pack[2] = TO_OUTPUT_TYPE(res2);
         pack[3] = TO_OUTPUT_TYPE(res3);
+#endif
 #endif
         dst[i] = AS_PACKED_OUT_TYPE(pack);
     }
 
     const bool full_x = OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE == 0 || x + OUTPUT_X_BLOCK_SIZE <= OUTPUT_SIZE_X;
     const bool full_f = OUTPUT_FEATURE_NUM % OSV_SIZE == 0 || (fg + 1) * OSV_SIZE <= OUTPUT_FEATURE_NUM;
+#if SUB_GROUP_SIZE == 8
     if (full_x && full_f) {
 #if OUTPUT_DIMS == 5
-        const uint dst_index = (OUTPUT_GET_INDEX(b, fg*OSV_SIZE, z, y, x)) / 4;
+        const uint dst_index = (OUTPUT_GET_INDEX(b, fg*OSV_SIZE, z, y, x)) / OF_TO_DO;
 #elif OUTPUT_DIMS <= 4
-        const uint dst_index = (OUTPUT_GET_INDEX(b, fg*OSV_SIZE, y, x)) / 4;
+        const uint dst_index = (OUTPUT_GET_INDEX(b, fg*OSV_SIZE, y, x)) / OF_TO_DO;
 #endif
         BLOCK_WRITE(output + dst_index, dst);
     } else {
-#if OUTPUT_FEATURE_NUM % 4 == 0
+#endif //SUB_GROUP_SIZE == 8
+#if OUTPUT_FEATURE_NUM % OF_TO_DO == 0
         for (int i = 0; i < OUTPUT_X_BLOCK_SIZE; i++) {
             const bool full_it_x = OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE == 0 || x + i < OUTPUT_SIZE_X;
-            const bool full_sgl_f = OUTPUT_FEATURE_NUM % OSV_SIZE == 0 || fg * OSV_SIZE + 4 * lid < OUTPUT_FEATURE_NUM;
+            const bool full_sgl_f = OUTPUT_FEATURE_NUM % OSV_SIZE == 0 || fg * OSV_SIZE + OF_TO_DO * lid < OUTPUT_FEATURE_NUM;
             if (full_it_x && full_sgl_f) {
 #   if OUTPUT_DIMS == 5
-                const uint dst_index = OUTPUT_GET_INDEX(b, fg*OSV_SIZE + 4*lid, z, y, x+i);
+                const uint dst_index = OUTPUT_GET_INDEX(b, fg*OSV_SIZE + OF_TO_DO*lid, z, y, x+i);
 #   elif OUTPUT_DIMS <= 4
-                const uint dst_index = OUTPUT_GET_INDEX(b, fg*OSV_SIZE + 4*lid, y, x+i);
+                const uint dst_index = OUTPUT_GET_INDEX(b, fg*OSV_SIZE + OF_TO_DO*lid, y, x+i);
 #   endif
-                output[dst_index/4] = dst[i];
+                output[dst_index/OF_TO_DO] = dst[i];
             }
         }
-#else  // OUTPUT_FEATURE_NUM % 4 == 0
+#else  // OUTPUT_FEATURE_NUM % OF_TO_DO == 0
         for (int i = 0; i < OUTPUT_X_BLOCK_SIZE; i++) {
-            for (int ofm = 0; ofm < 4; ++ofm) {
+            for (int ofm = 0; ofm < OF_TO_DO; ++ofm) {
                 const bool full_it_x = OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE == 0 || x + i < OUTPUT_SIZE_X;
-                const bool full_sgl_f = OUTPUT_FEATURE_NUM % OSV_SIZE == 0 || fg * OSV_SIZE + 4 * lid + ofm < OUTPUT_FEATURE_NUM;
+                const bool full_sgl_f = OUTPUT_FEATURE_NUM % OSV_SIZE == 0 || fg * OSV_SIZE + OF_TO_DO * lid + ofm < OUTPUT_FEATURE_NUM;
                 if (full_it_x && full_sgl_f) {
 #   if OUTPUT_DIMS == 5
-                    const uint dst_index = OUTPUT_GET_INDEX(b, fg*OSV_SIZE + 4*lid + ofm, z, y, x+i);
+                    const uint dst_index = OUTPUT_GET_INDEX(b, fg*OSV_SIZE + OF_TO_DO*lid + ofm, z, y, x+i);
 #   elif OUTPUT_DIMS <= 4
-                    const uint dst_index = OUTPUT_GET_INDEX(b, fg*OSV_SIZE + 4*lid + ofm, y, x+i);
+                    const uint dst_index = OUTPUT_GET_INDEX(b, fg*OSV_SIZE + OF_TO_DO*lid + ofm, y, x+i);
 #   endif
+#if SUB_GROUP_SIZE == 8
                     ((__global uchar*)output)[dst_index] = as_uchar4(dst[i])[ofm];
+#else
+                    ((__global uchar*)output)[dst_index] = as_uchar2(dst[i])[ofm];
+#endif
                 }
             }
         }
 #endif  // OUTPUT_FEATURE_NUM % 4 == 0
+#if SUB_GROUP_SIZE == 8
     }
+#endif //SUB_GROUP_SIZE == 8
 #endif  // OUTPUT_IS_FP
 }
 
diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/fetch_weights.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/fetch_weights.cl
index 08d9229a4b109c..3d2449ab7ebf0d 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/fetch_weights.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/fetch_weights.cl
@@ -794,6 +794,65 @@ inline uint get_os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4_index(uint o, uint i,
     return idx;
 }
 
+inline uint get_os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2_index(  // Linear index of weight (o, i, y, x) in the osa2/isa8/osv16/isv4 blocked layout.
+    uint o, uint i, uint y, uint x,   // o / i: output / input feature index (cf. size_ofm / size_ifm); y, x: spatial position
+    uint size_x, uint size_y,         // spatial extents; used below as strides for y and for the feature blocks
+    uint size_ifm, uint size_ofm,     // size_ifm is rounded up to whole 32-blocks below; size_ofm is not read in this function
+    uint offset)                      // constant added to the computed index
+{
+    const uint o_swizzled = (o % 2) * 16 + ((o % 32) / 2) + (o / 32) * 32;  // inside each 32-wide block: even o -> slots 0..15, odd o -> slots 16..31
+    const uint isv_idx = i % 4;                  // position inside a group of 4 input features (innermost dimension)
+    const uint isa_idx = (i / 4) % 8;            // which of the 8 groups of 4 inside a 32-wide input block
+    const uint is_idx  = i / 32;                 // 32-wide input-feature block
+    const uint osv_idx = o_swizzled % 16;        // reduces to (o % 32) / 2, range 0..15
+    const uint osa_idx = (o_swizzled / 16) % 2;  // reduces to o % 2
+    const uint os_idx  = o / 32;                 // 32-wide output-feature block
+
+    const uint f_32_aligned = (size_ifm + 31) / 32;  // number of 32-wide input blocks (rounded up), not an aligned feature count
+
+    size_t idx = offset +
+                 isv_idx +
+                 osv_idx * 4 +                    // 4 isv entries per osv slot
+                 isa_idx * 16 * 4 +               // 64 = 16 osv slots * 4
+                 osa_idx * 16 * 32 +              // 512 = 8 isa groups * 64
+                 x * 32 * 32 +                    // 1024 = one full 32 (o) x 32 (i) tile per spatial point
+                 y * size_x * 32 * 32 +
+                 is_idx * 32 * 32 * size_x * size_y +                  // one input block covers every spatial point
+                 os_idx * 32 * 32 * f_32_aligned * size_x * size_y;    // one output block covers every input block
+
+    return idx;  // idx is size_t; it is converted to the uint return type here
+}
+
+inline uint get_os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2_index(  // Linear index of weight (o, i, z, y, x) in the osa2/isa8/osv16/isv4 blocked layout.
+    uint o, uint i, uint z, uint y, uint x,   // o / i: output / input feature index (cf. size_ofm / size_ifm); z, y, x: spatial position
+    uint size_x, uint size_y, uint size_z,    // spatial extents; used below as strides for y, z and for the feature blocks
+    uint size_ifm, uint size_ofm,             // size_ifm is rounded up to whole 32-blocks below; size_ofm is not read in this function
+    uint offset)                              // constant added to the computed index
+{
+    const uint o_swizzled = (o % 2) * 16 + ((o % 32) / 2) + (o / 32) * 32;  // inside each 32-wide block: even o -> slots 0..15, odd o -> slots 16..31
+    const uint isv_idx = i % 4;                  // position inside a group of 4 input features (innermost dimension)
+    const uint isa_idx = (i / 4) % 8;            // which of the 8 groups of 4 inside a 32-wide input block
+    const uint is_idx  = i / 32;                 // 32-wide input-feature block
+    const uint osv_idx = o_swizzled % 16;        // reduces to (o % 32) / 2, range 0..15
+    const uint osa_idx = (o_swizzled / 16) % 2;  // reduces to o % 2
+    const uint os_idx  = o / 32;                 // 32-wide output-feature block
+
+    const uint f_32_aligned = (size_ifm + 31) / 32;  // number of 32-wide input blocks (rounded up), not an aligned feature count
+
+    size_t idx = offset +
+                 isv_idx +
+                 osv_idx * 4 +                    // 4 isv entries per osv slot
+                 isa_idx * 16 * 4 +               // 64 = 16 osv slots * 4
+                 osa_idx * 16 * 32 +              // 512 = 8 isa groups * 64
+                 x * 32 * 32 +                    // 1024 = one full 32 (o) x 32 (i) tile per spatial point
+                 y * size_x * 32 * 32 +
+                 z * size_x * size_y * 32 * 32 +
+                 is_idx * 32 * 32 * size_x * size_y * size_z +                  // one input block covers every spatial point
+                 os_idx * 32 * 32 * f_32_aligned * size_x * size_y * size_z;    // one output block covers every input block
+
+    return idx;  // idx is size_t; it is converted to the uint return type here
+}
+
 inline uint get_os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4_index(uint o, uint i, uint z, uint y, uint x,
                                                                         uint size_x, uint size_y, uint size_z,
                                                                         uint size_ifm, uint size_ofm, uint offset)
@@ -985,6 +1044,25 @@ inline uint get_g_is_os_yx_isa4_osa8_isv8_osv4(uint g, uint o, uint i, uint z, u
         CAT(prefix, _OFM_NUM),                                                              \
         CAT(prefix, _OFFSET))
 
+#define GET_FILTER_OS_IS_YX_OSA2_ISA8_OSV16_ISV4_SWIZZLED_BY_2_INDEX(prefix, o, i, y, x) /* forwards (o, i, y, x) plus the CAT(prefix, ...) size/offset names to the index function */ \
+    get_os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2_index(                               \
+        o, i, y, x,                                                                     \
+        CAT(prefix, _SIZE_X),                                                           \
+        CAT(prefix, _SIZE_Y),                                                           \
+        CAT(prefix, _IFM_NUM),                                                          \
+        CAT(prefix, _OFM_NUM),                                                          \
+        CAT(prefix, _OFFSET))
+
+#define GET_FILTER_OS_IS_ZYX_OSA2_ISA8_OSV16_ISV4_SWIZZLED_BY_2_INDEX(prefix, o, i, z, y, x) /* forwards (o, i, z, y, x) plus the CAT(prefix, ...) size/offset names to the index function */ \
+    get_os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2_index(                               \
+        o, i, z, y, x,                                                                     \
+        CAT(prefix, _SIZE_X),                                                           \
+        CAT(prefix, _SIZE_Y),                                                           \
+        CAT(prefix, _SIZE_Z),                                                           \
+        CAT(prefix, _IFM_NUM),                                                          \
+        CAT(prefix, _OFM_NUM),                                                          \
+        CAT(prefix, _OFFSET))
+
 inline uint get_is_o32_yx_isv32_swizzled_by_4_index(uint o, uint i, uint y, uint x, uint i_size, uint o_size, uint x_size, uint y_size)
 {
     const uint o_aligned_to_32 = ((o_size + 31) / 32) * 32;
diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_weights.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_weights.cl
index 900bae7fe41815..9d234456185a69 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_weights.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_weights.cl
@@ -431,6 +431,10 @@ inline uint FUNC(get_output_index)(uint g, uint o, uint i, uint z, uint y, uint
     return GET_FILTER_OS_IS_YX_OSA4_ISA8_OSV8_ISV4_SWIZZLED_BY_4_INDEX(OUTPUT, o, i, y, x);
 #elif defined OUTPUT_LAYOUT_OS_IS_ZYX_OSA4_ISA8_OSV8_ISV4_SWIZZLED_BY_4
     return GET_FILTER_OS_IS_ZYX_OSA4_ISA8_OSV8_ISV4_SWIZZLED_BY_4_INDEX(OUTPUT, o, i, z, y, x);
+#elif defined OUTPUT_LAYOUT_OS_IS_YX_OSA2_ISA8_OSV16_ISV4_SWIZZLED_BY_2
+    return GET_FILTER_OS_IS_YX_OSA2_ISA8_OSV16_ISV4_SWIZZLED_BY_2_INDEX(OUTPUT, o, i, y, x);
+#elif defined OUTPUT_LAYOUT_OS_IS_ZYX_OSA2_ISA8_OSV16_ISV4_SWIZZLED_BY_2
+    return GET_FILTER_OS_IS_ZYX_OSA2_ISA8_OSV16_ISV4_SWIZZLED_BY_2_INDEX(OUTPUT, o, i, z, y, x);
 #elif defined OUTPUT_LAYOUT_OS_IS_YX_ISV16_OSV16
     return GET_FILTER_OS_IS_YX_ISV_OSV_INDEX(OUTPUT, o, i, y, x, 16, 16);
 #elif defined OUTPUT_LAYOUT_OS_IS_ZYX_ISV16_OSV16
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.cpp
index f6c1735b46df77..5fdc58556d3f6b 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.cpp
@@ -347,6 +347,8 @@ std::string toString(WeightsLayout layout) {
         case WeightsLayout::os_is_zyx_isa8_osv16_isv4:                   return "OS_IS_ZYX_ISA8_OSV16_ISV4";
         case WeightsLayout::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4:  return "OS_IS_YX_OSA4_ISA8_OSV8_ISV4_SWIZZLED_BY_4";
         case WeightsLayout::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4: return "OS_IS_ZYX_OSA4_ISA8_OSV8_ISV4_SWIZZLED_BY_4";
+        case WeightsLayout::os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2: return "OS_IS_YX_OSA2_ISA8_OSV16_ISV4_SWIZZLED_BY_2";
+        case WeightsLayout::os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2: return "OS_IS_ZYX_OSA2_ISA8_OSV16_ISV4_SWIZZLED_BY_2";
         case WeightsLayout::os_is_yx_osv16_isv4:                         return "OS_IS_YX_OSV16_ISV4";
         case WeightsLayout::os_is_yx_osv32_isv4_swizzled_by_2:           return "OS_IS_YX_OSV32_ISV4_SWIZZLED_BY_2";
         case WeightsLayout::os_is_yx_osv32_isv4:                         return "OS_IS_YX_OSV32_ISV4";
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_base.cpp
index b24cc863f844b3..5c75e73f981434 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_base.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_base.cpp
@@ -410,8 +410,8 @@ Datatype ConvolutionKernelBase::GetPackedInputType(const convolution_params& par
     return GetPackedType(params.inputs[0].GetDType());
 }
 
-Datatype ConvolutionKernelBase::GetPackedOutputType(const convolution_params& params) const {
-    return GetPackedType(params.outputs[0].GetDType());
+Datatype ConvolutionKernelBase::GetPackedOutputType(const convolution_params& params, size_t pack_size) const {
+    return GetPackedType(params.outputs[0].GetDType(), pack_size);  // forwards the first output's data type and the caller's pack_size to GetPackedType
 }
 
 Datatype ConvolutionKernelBase::GetActivationType(const convolution_params& params) const {
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_base.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_base.h
index 8e4806d2484b44..66aa127fd467e4 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_base.h
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_base.h
@@ -77,7 +77,7 @@ class ConvolutionKernelBase : public WeightBiasKernelBase {
 
     Datatype GetPackedType(Datatype dt, size_t pack_size = 4) const;
     Datatype GetPackedInputType(const convolution_params& params) const;
-    Datatype GetPackedOutputType(const convolution_params& params) const;
+    Datatype GetPackedOutputType(const convolution_params& params, size_t pack_size = 4) const;
     Datatype GetActivationType(const convolution_params& params) const;
     Datatype GetAccumulatorType(const convolution_params& params) const;
     void GetUpdateDispatchDataFunc(KernelData& kd) const override;
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_mmad_b_fs_yx_fsv32.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_mmad_b_fs_yx_fsv32.cpp
index 4627336aad8465..bf63e8ffd30f20 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_mmad_b_fs_yx_fsv32.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_mmad_b_fs_yx_fsv32.cpp
@@ -63,7 +63,7 @@ bool ConvolutionKernel_mmad_b_fs_yx_fsv32::Validate(const Params& p) const {
         DO_NOT_USE_THIS_KERNEL(p.layerID);
     }
 
-    if (!IsSIMDSizeSupported(params.engineInfo, 8))
+    if (!IsSIMDSizeSupported(params.engineInfo, 8) && !IsSIMDSizeSupported(params.engineInfo, 16))
         DO_NOT_USE_THIS_KERNEL(p.layerID);
 
     if (params.groups > 1)
@@ -109,12 +109,18 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_b_fs_yx_fsv32::SetDef
             break;
         ow_group--;
     }
-
-    dispatchData.gws[0] = Align(cp.outputs[0].Feature().v, 32) / 4;
+    if (IsSIMDSizeSupported(cp.engineInfo, 8)) {
+        dispatchData.gws[0] = Align(cp.outputs[0].Feature().v, 32) / 4;
+    } else {
+        dispatchData.gws[0] = Align(cp.outputs[0].Feature().v, 32) / 2;
+    }
     dispatchData.gws[1] = Align(CeilDiv(cp.outputs[0].X().v, dispatchData.cldnnStyle.blockWidth), ow_group) * cp.outputs[0].Y().v * cp.outputs[0].Z().v;
     dispatchData.gws[2] = cp.outputs[0].Batch().v;
-
-    dispatchData.lws[0] = 8;
+    if (IsSIMDSizeSupported(cp.engineInfo, 8)) {
+        dispatchData.lws[0] = 8;
+    } else {
+        dispatchData.lws[0] = 16;
+    }
     dispatchData.lws[1] = ow_group;
     dispatchData.lws[2] = 1;
 
@@ -144,7 +150,13 @@ JitConstants ConvolutionKernel_mmad_b_fs_yx_fsv32::GetJitConstants(const convolu
     jit.AddConstant(MakeJitConstant("INPUT_LINE_SIZE", input_line_size));
 
     jit.Merge(MakeTypeJitConstants(GetPackedInputType(params), "PACKED_IN"));
-    jit.Merge(MakeTypeJitConstants(GetPackedOutputType(params), "PACKED_OUT"));
+    if (IsSIMDSizeSupported(params.engineInfo, 8)) {
+        jit.Merge(MakeTypeJitConstants(GetPackedOutputType(params), "PACKED_OUT"));
+        jit.AddConstant(MakeJitConstant("OF_TO_DO", 4));
+    } else {
+        jit.Merge(MakeTypeJitConstants(GetPackedOutputType(params, 2), "PACKED_OUT"));
+        jit.AddConstant(MakeJitConstant("OF_TO_DO", 2));
+    }
     if (params.weights.GetDType() == WeightsType::INT8) {
         jit.AddConstant(MakeJitConstant("FILTER_TYPE_CHAR", 1));
     } else if (params.weights.GetDType() == WeightsType::UINT8) {
@@ -159,22 +171,26 @@ JitConstants ConvolutionKernel_mmad_b_fs_yx_fsv32::GetJitConstants(const convolu
         std::vector<std::string> idx_order2;
         std::vector<std::string> idx_order3;
         if (DataTensor::ChannelsCount(params.outputs[0].GetLayout()) == 4) {
-            idx_order0 = {"b", "(fg*32 + 4*lid+0)", "y", "(x+i)"};
-            idx_order1 = {"b", "(fg*32 + 4*lid+1)", "y", "(x+i)"};
-            idx_order2 = {"b", "(fg*32 + 4*lid+2)", "y", "(x+i)"};
-            idx_order3 = {"b", "(fg*32 + 4*lid+3)", "y", "(x+i)"};
+            idx_order0 = {"b", "(fg*32 + OF_TO_DO*lid+0)", "y", "(x+i)"};
+            idx_order1 = {"b", "(fg*32 + OF_TO_DO*lid+1)", "y", "(x+i)"};
+            idx_order2 = {"b", "(fg*32 + OF_TO_DO*lid+2)", "y", "(x+i)"};
+            idx_order3 = {"b", "(fg*32 + OF_TO_DO*lid+3)", "y", "(x+i)"};
         } else if (DataTensor::ChannelsCount(params.outputs[0].GetLayout()) == 5) {
-            idx_order0 = {"b", "(fg*32 + 4*lid+0)", "z", "y", "(x+i)"};
-            idx_order1 = {"b", "(fg*32 + 4*lid+1)", "z", "y", "(x+i)"};
-            idx_order2 = {"b", "(fg*32 + 4*lid+2)", "z", "y", "(x+i)"};
-            idx_order3 = {"b", "(fg*32 + 4*lid+3)", "z", "y", "(x+i)"};
+            idx_order0 = {"b", "(fg*32 + OF_TO_DO*lid+0)", "z", "y", "(x+i)"};
+            idx_order1 = {"b", "(fg*32 + OF_TO_DO*lid+1)", "z", "y", "(x+i)"};
+            idx_order2 = {"b", "(fg*32 + OF_TO_DO*lid+2)", "z", "y", "(x+i)"};
+            idx_order3 = {"b", "(fg*32 + OF_TO_DO*lid+3)", "z", "y", "(x+i)"};
         }
 
         FusedOpsConfiguration conf0 = {"_0", idx_order0, "res0", input_dt, 1 };
         FusedOpsConfiguration conf1 = {"_1", idx_order1, "res1", input_dt, 1 };
         FusedOpsConfiguration conf2 = {"_2", idx_order2, "res2", input_dt, 1 };
         FusedOpsConfiguration conf3 = {"_3", idx_order3, "res3", input_dt, 1 };
-        jit.Merge(MakeFusedOpsJitConstants(params, {conf0, conf1, conf2, conf3}));
+        if (IsSIMDSizeSupported(params.engineInfo, 8)) {
+            jit.Merge(MakeFusedOpsJitConstants(params, {conf0, conf1, conf2, conf3}));
+        } else {
+            jit.Merge(MakeFusedOpsJitConstants(params, {conf0, conf1}));
+        }
     }
 
     return jit;
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_mmad_b_fs_yx_fsv32.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_mmad_b_fs_yx_fsv32.h
index 212298fc9f5410..51ff8efed783e5 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_mmad_b_fs_yx_fsv32.h
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_mmad_b_fs_yx_fsv32.h
@@ -29,10 +29,18 @@ class ConvolutionKernel_mmad_b_fs_yx_fsv32 : public ConvolutionKernelBase {
     bool NeedPaddedInput() const override { return false; }
 
     WeightsLayout GetPreferredWeightsLayout(const convolution_params &p) const override {
-        if (DataTensor::ChannelsCount(p.outputs[0].GetLayout()) <= 4) {
-            return WeightsLayout::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4;
+        if (IsSIMDSizeSupported(p.engineInfo, 8)) {  // SIMD8 available: keep the osa4/osv8 swizzled_by_4 layouts
+            if (DataTensor::ChannelsCount(p.outputs[0].GetLayout()) <= 4) {  // ChannelsCount <= 4 -> yx layout, else zyx
+                return WeightsLayout::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4;
+            } else {
+                return WeightsLayout::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4;
+            }
         } else {
-            return WeightsLayout::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4;
+            if (DataTensor::ChannelsCount(p.outputs[0].GetLayout()) <= 4) {  // no SIMD8: use the osa2/osv16 swizzled_by_2 layouts
+                return WeightsLayout::os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2;
+            } else {
+                return WeightsLayout::os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2;
+            }
         }
     }
     std::vector<FusedOpType> GetSupportedFusedOps() const override {
diff --git a/src/plugins/intel_gpu/src/kernel_selector/tensor_type.cpp b/src/plugins/intel_gpu/src/kernel_selector/tensor_type.cpp
index aa2568c0367c95..3b247ddd9c8421 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/tensor_type.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/tensor_type.cpp
@@ -114,6 +114,8 @@ WeightsTensor::WeightsChannelArray WeightsTensor::weightsChannelArray {{
     { WeightsLayout::os_is_zyx_isa8_osv16_isv4,                   {  0,  1,  2,   3,   4, -1 } },
     { WeightsLayout::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4,  {  0,  1, -1,   2,   3, -1 } },
     { WeightsLayout::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4, {  0,  1,  2,   3,   4, -1 } },
+    { WeightsLayout::os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2,  {  0,  1, -1,   2,   3, -1 } },
+    { WeightsLayout::os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2, {  0,  1,  2,   3,   4, -1 } },
     { WeightsLayout::os_is_yx_osv8_isv4,                          {  0,  1, -1,   2,   3, -1 } },
     { WeightsLayout::os_is_yx_osv16_isv4,                         {  0,  1, -1,   2,   3, -1 } },
     { WeightsLayout::os_is_yx_osv32_isv4_swizzled_by_2,           {  0,  1, -1,   2,   3, -1 } },
@@ -629,11 +631,13 @@ NDims WeightsTensor::GetSimpleDims(const std::vector<size_t>& d, WeightsLayout l
             newDims[4] = RoundUp(newDims[4], 16);
             break;
         case os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4:
+        case os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2:
             assert(newDims.size() == 4);
             newDims[3] = RoundUp(newDims[3], 32);
             newDims[2] = RoundUp(newDims[2], 32);
             break;
         case os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4:
+        case os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2:
             assert(newDims.size() == 5);
             newDims[4] = RoundUp(newDims[4], 32);
             newDims[3] = RoundUp(newDims[3], 32);
diff --git a/src/plugins/intel_gpu/src/kernel_selector/tensor_type.h b/src/plugins/intel_gpu/src/kernel_selector/tensor_type.h
index e272cad2eaf654..3a080fe3606b33 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/tensor_type.h
+++ b/src/plugins/intel_gpu/src/kernel_selector/tensor_type.h
@@ -140,6 +140,8 @@ enum WeightsLayout {
                                                  // 1,5...
     os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4,  // for MMAD convolution swizzled from ofm 0..7 to 0,4,8,12,16,20,24,28,
                                                   // 1,5...
+    os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2,
+    os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2,
     os_is_yx_osv16_isv4,                 // swizzled weights for convolution using IMAD
     os_is_yx_osv8_isv4,                      // weights for int8 blocked conv
     os_is_yx_osv32_isv4_swizzled_by_2,   //  weights for bfyx -> b_fs_yx_fsv32 convolution using IMAD with swizzled ofm (0, 2, 4..), (1, 3, 5...)
diff --git a/src/plugins/intel_gpu/src/runtime/format.cpp b/src/plugins/intel_gpu/src/runtime/format.cpp
index 7601897e1e1a7d..95a03f7e060554 100644
--- a/src/plugins/intel_gpu/src/runtime/format.cpp
+++ b/src/plugins/intel_gpu/src/runtime/format.cpp
@@ -107,6 +107,8 @@ static const std::map<format::type, format_traits> format_traits_map {
         FMT_TRAITS(os_is_zyx_isa8_osv16_isv4,                    1, 1, 3, 0, {0, 1, 2, 3, 4}, "oizyx",  "oixyz", {{1, 8}, {0, 16}, {1, 4}},         {{1, 8}, {0, 16}, {1, 4}}),  // NOLINT
         FMT_TRAITS(os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4,   1, 1, 2, 0, {0, 1, 2, 3},    "oiyx",   "oixy",  {{0, 32}, {1, 32}},                {{0, 32}, {1, 32}}),  // NOLINT
         FMT_TRAITS(os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4,  1, 1, 3, 0, {0, 1, 2, 3, 4}, "oizyx",  "oixyz", {{0, 32}, {1, 32}},                {{0, 32}, {1, 32}}),  // NOLINT
+        FMT_TRAITS(os_is_yx_osa2_isa8_osv16_isv4_swizzled_by_2,   1, 1, 2, 0, {0, 1, 2, 3},    "oiyx",   "oixy",  {{0, 32}, {1, 32}},                {{0, 32}, {1, 32}}),  // NOLINT
+        FMT_TRAITS(os_is_zyx_osa2_isa8_osv16_isv4_swizzled_by_2,   1, 1, 3, 0, {0, 1, 2, 3, 4}, "oizyx",  "oixyz", {{0, 32}, {1, 32}},                {{0, 32}, {1, 32}}),  // NOLINT
         FMT_TRAITS(os_is_yx_osv16_isv4,                          1, 1, 2, 0, {0, 1, 2, 3},    "oiyx",   "oixy",  {{0, 16}, {1, 4}},                 {{0, 16}, {1, 4}}),  // NOLINT
         FMT_TRAITS(os_is_yx_osv8_isv4,                           1, 1, 2, 0, {0, 1, 2, 3},    "oiyx",   "oixy",  {{0, 8}, {1, 4}},                  {{0, 8}, {1, 4}}),  // NOLINT
         FMT_TRAITS(os_is_zyx_osv16_isv16,                        1, 1, 3, 0, {0, 1, 2, 3, 4}, "oizyx",  "oixyz", {{0, 16}, {1, 16}},                {{0, 16}, {1, 16}}),  // NOLINT
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp
index 04c0cf7b79fc0a..8c35c8774250a7 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp
@@ -9110,6 +9110,7 @@ class convolution_scale_random_test : public convolution_random_test_base<InputT
 };
 
 class convolution_random_smoke_test : public testing::TestWithParam<convolution_random_test_all_params> {};
+class convolution_random_fsv32_test : public testing::TestWithParam<convolution_random_test_all_params> {};  // fixture for the forced mmad_b_fs_yx_fsv32 runs
 
 using convolution_random_test_s8s8f32 = convolution_random_test_base<int8_t, int8_t, float>;
 using convolution_random_test_u8s8f32 = convolution_random_test_base<uint8_t, int8_t, float>;
@@ -9238,6 +9239,32 @@ struct params_generator : std::vector<convolution_random_test_all_params> {
         return *this;
     }
 
+    params_generator& forced_mmad_b_fs_yx_fsv32_test_params(format::type input_format,
+                                        bool asymm_weights = false,
+                                        bool asymm_data = false,
+                                        bool padded_input = false,
+                                        bool bigger_pad = false) {
+        std::vector<size_t> batches = { 1, 2 };  // every case below is appended once per batch size
+        for (auto b : batches) {
+            // 3x3: the two cases differ only in the sixth initializer, { 1, 1 } vs { 2, 2 } (presumably stride - confirm against the struct)
+            push_back(convolution_random_test_all_params{
+                b, 32, 48, { 14, 14 }, { 3, 3 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input, bigger_pad, false });
+            push_back(convolution_random_test_all_params{
+                b, 32, 48, { 14, 14 }, { 3, 3 }, { 2, 2 }, { 1, 1 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input, bigger_pad, false });
+            // 1x1: same { 1, 1 } / { 2, 2 } pair as above, on a { 28, 28 } case
+            push_back(convolution_random_test_all_params{
+                b, 32, 48, { 28, 28 }, { 1, 1 }, { 1, 1 }, { 0, 0 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input, bigger_pad, false });
+            push_back(convolution_random_test_all_params{
+                b, 32, 48, { 28, 28 }, { 1, 1 }, { 2, 2 }, { 0, 0 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input, bigger_pad, false });
+            // 5x5: same { 1, 1 } / { 2, 2 } pair as above, on a { 28, 28 } case
+            push_back(convolution_random_test_all_params{
+                b, 32, 48, { 28, 28 }, { 5, 5 }, { 1, 1 }, { 2, 2 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input, bigger_pad, false });
+            push_back(convolution_random_test_all_params{
+                b, 32, 48, { 28, 28 }, { 5, 5 }, { 2, 2 }, { 2, 2 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input, bigger_pad, false });
+        }
+        return *this;  // returning the generator lets calls be chained
+    }
+
     params_generator& all_test_params(format::type input_format,
                                       bool asymm_weights = false,
                                       bool asymm_data = false,
@@ -9305,6 +9332,28 @@ INSTANTIATE_TEST_SUITE_P(
     to_string_convolution_all_params
 );
 
+TEST_P(convolution_random_fsv32_test, u8u8f32_forced_mmad_b_fs_yx_fsv32) {
+    convolution_random_test_u8u8f32 test;
+    const auto p = GetParam();
+    ov::intel_gpu::ImplForcingMap force_map{ { "conv", { p.input_format, "convolution_gpu_mmad_b_fs_yx_fsv32" } } };
+    test.set_forced_impl(force_map);  // pin primitive "conv" to the MMAD fsv32 kernel in p.input_format
+    ASSERT_NO_FATAL_FAILURE(test.run_random(p));  // run the random case for this parameter set
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    basic,
+    convolution_random_fsv32_test,
+    testing::ValuesIn(
+        params_generator()
+        .forced_mmad_b_fs_yx_fsv32_test_params(format::b_fs_yx_fsv32)  // all flags left at their defaults (false)
+        .forced_mmad_b_fs_yx_fsv32_test_params(format::b_fs_yx_fsv32, true, true)  // asymm_weights + asymm_data
+        .forced_mmad_b_fs_yx_fsv32_test_params(format::b_fs_yx_fsv32, false, true)  // asymm_data only
+        .forced_mmad_b_fs_yx_fsv32_test_params(format::b_fs_yx_fsv32, true, false)  // asymm_weights only
+        .forced_mmad_b_fs_yx_fsv32_test_params(format::b_fs_yx_fsv32, false, false, true)  // padded_input only
+    ),
+    to_string_convolution_all_params
+);
+
 class convolution_random_all_test : public testing::TestWithParam<convolution_random_test_all_params> {};
 
 TEST_P(convolution_random_all_test, u8s8f32) {
@@ -11741,6 +11790,7 @@ struct conv_dyn_3d_test_params {
 };
 
 class conv_dyn_3d_test : public testing::TestWithParam<conv_dyn_3d_test_params> {};
+class conv_3d_test_mmad : public testing::TestWithParam<conv_dyn_3d_test_params> {};  // reuses conv_dyn_3d_test_params for the forced-MMAD 3D test
 
 TEST_P(conv_dyn_3d_test, convolution_gpu_b_fs_zyx_fsv16_imad_quantized) {
     auto& engine = get_test_engine();
@@ -11866,6 +11916,11 @@ INSTANTIATE_TEST_SUITE_P(smoke, conv_dyn_3d_test,
     { ov::Shape{1, 16, 5, 5, 5}, ov::Shape{16, 1, 1, 3, 3, 3}, ov::Strides{1, 1, 1}, ov::Strides{1, 1, 1}, ov::CoordinateDiff{0, 0, 0}, ov::CoordinateDiff{0, 0, 0}, 16, false }
 }));
 
+INSTANTIATE_TEST_SUITE_P(smoke, conv_3d_test_mmad,
+    testing::ValuesIn(std::vector<conv_dyn_3d_test_params>{  // a single parameter set for now
+    { ov::Shape{32, 32, 5, 5, 5}, ov::Shape{32, 32, 3, 3, 3}, ov::Strides{1, 1, 1}, ov::Strides{1, 1, 1}, ov::CoordinateDiff{0, 0, 0}, ov::CoordinateDiff{0, 0, 0}, 1, false},
+}));
+
 TEST(group_convolution_f16_fw_gpu, basic_1d_group_convolution) {
     auto& engine = get_test_engine();
     if (!engine.get_device_info().supports_immad)
@@ -12159,3 +12214,86 @@ TEST(conv_dyn_test, changed_batch_convolution_test_reorder_cache_mismatch) {
         }
     }
 }
+
+TEST_P(conv_3d_test_mmad, convolution_gpu_b_fs_zyx_mmad) {  // forces the MMAD fsv32 kernel on a 5D conv and compares with a second network run
+    auto& engine = get_test_engine();
+    auto p = GetParam();
+
+    auto calculate_ref = [&](memory::ptr input, memory::ptr weights,  // builds and executes a second network from the same primitives
+                             memory::ptr a_zp, memory::ptr compensation, ExecutionConfig config) {
+        auto in_layout = input->get_layout();
+
+        topology topology_ref(
+            input_layout("input", in_layout),
+            data("weights", weights),
+            data("a_zp", a_zp),
+            data("compensation", compensation),
+            convolution("conv", input_info("input"), "weights", no_bias, "", "a_zp", "compensation",
+                        p.groups, p.stride, p.dilation, p.pad_begin, p.pad_end, false, data_types::f32));
+
+        network network_ref(engine, topology_ref, config);  // uses whatever config the caller passes in
+        network_ref.set_input_data("input", input);
+
+        auto outputs_ref = network_ref.execute();
+
+        return outputs_ref.at("conv").get_memory();
+    };
+
+    // Test data: u8 input in b_fs_zyx_fsv32, i8 weights in bfzyx, u8 a_zp and f32 compensation in bfyx.
+
+    auto in_layout = layout{p.in_shape, data_types::u8, format::b_fs_zyx_fsv32};
+    auto input = engine.allocate_memory({ p.in_shape, data_types::u8, format::b_fs_zyx_fsv32 });  // NOTE(review): same layout as in_layout above
+    auto weights = engine.allocate_memory({p.wei_shape, data_types::i8, format::bfzyx});
+
+    auto a_zp_shape = ov::Shape(p.in_shape.size(), 1);
+    a_zp_shape[1] = p.in_shape[1];  // all-ones shape except dim 1, which copies in_shape[1]
+    auto a_zp = engine.allocate_memory({ a_zp_shape, data_types::u8, format::bfyx });
+
+    auto compensation = engine.allocate_memory({ a_zp_shape, data_types::f32, format::bfyx });
+
+    tests::random_generator rg(GET_SUITE_NAME);
+    VF<uint8_t> input_rnd = rg.generate_random_1d<uint8_t>(ov::shape_size(p.in_shape), 0, 10);
+    VF<int8_t> weights_rnd = rg.generate_random_1d<int8_t>(ov::shape_size(p.wei_shape), -5, 5);
+    VF<uint8_t> a_zp_rnd = rg.generate_random_1d<uint8_t>(ov::shape_size(a_zp_shape), 1, 5);
+    VF<float> compensation_rnd = rg.generate_random_1d<float>(ov::shape_size(a_zp_shape), -5, 5);
+
+    set_values(input, input_rnd);
+    set_values(weights, weights_rnd);
+    set_values(a_zp, a_zp_rnd);
+    set_values(compensation, compensation_rnd);
+
+    topology topology(
+        input_layout("input", in_layout),
+        data("weights", weights),
+        data("a_zp", a_zp),
+        data("compensation", compensation),
+        convolution("conv", input_info("input"), "weights", no_bias, "", "a_zp", "compensation",
+                    p.groups, p.stride, p.dilation, p.pad_begin, p.pad_end, false, data_types::f32));
+
+    ExecutionConfig config = get_test_default_config(engine);
+    ov::intel_gpu::ImplementationDesc conv_impl = { format::b_fs_zyx_fsv32, "convolution_gpu_mmad_b_fs_yx_fsv32", impl_types::ocl };
+    config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ { "conv", conv_impl } }));
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    config.set_property(ov::enable_profiling(true));  // NOTE(review): no profiling data is read in this test - confirm this is needed
+
+    cldnn::network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), p.is_caching_test);
+    network->set_input_data("input", input);
+
+    auto inst = network->get_primitive("conv");
+    auto impl = inst->get_impl();
+    ASSERT_TRUE(impl != nullptr);  // only checks that an impl exists; the selected kernel name is not verified here
+
+    auto outputs = network->execute();
+
+    auto output_memory = outputs.at("conv").get_memory();
+
+    auto output_memory_ref = calculate_ref(input, weights, a_zp, compensation, config);  // NOTE(review): config still forces the MMAD impl on "conv", so the reference may run the same kernel - confirm
+
+    cldnn::mem_lock<float> output_ptr(output_memory, get_test_stream());
+    cldnn::mem_lock<float> output_ptr_ref(output_memory_ref, get_test_stream());
+
+    ASSERT_EQ(outputs.at("conv").get_layout(), output_memory_ref->get_layout());
+    for (size_t i = 0; i < output_ptr.size(); i++) {
+        ASSERT_EQ(output_ptr[i], output_ptr_ref[i]);  // exact (bitwise-equal) comparison of f32 outputs
+    }
+}