Skip to content

Commit c7c101f

Browse files
authored
[GPU] Add int8 fsv32 opt concat kernel (#34624)
### Description of the issue (symptom, root cause, how it was resolved)

#### Symptom
The YOLO26n INT8 model shows a significant performance bottleneck on GPU. Profiling revealed that feature-axis concatenation layers in `b_fs_yx_fsv32` format were falling back to the reference kernel, causing high latency.

#### Root cause
There was no optimized concatenation kernel for the `b_fs_yx_fsv32` format. When the DPAS pipeline produces INT8 tensors in this format with feature counts not aligned to 32, every concat operation dispatches the generic reference kernel, which processes elements one by one without leveraging sub-group block I/O.

#### Resolution
Added a new optimized concatenation kernel (`concatenation_gpu_b_fs_yx_fsv32`) for feature-axis concat in `b_fs_yx_fsv32` format, supporting INT8/UINT8/FP16 data types. The kernel handles two cases based on the cumulative output offset alignment:
- Aligned (output offset % 32 == 0): uses `block_read2`/`block_write2` for maximum throughput. A runtime guard handles the last partial feature block with scalar writes.
- Unaligned (output offset % 32 != 0): uses `block_read2` for input plus per-element scalar writes with `OUTPUT_GET_INDEX` to correctly address features that span different FSV32 slices in memory.

#### Reproduction step and snapshot (if applicable; do not attach for customer models)
`$ ./benchmark_app -m ./yolo26n_int8_openvino_model/yolo26n.xml -d GPU -t 10`

#### Checklist
- [x] Is it a proper fix?
- [x] Did you include a test case for this fix, if necessary?
- [x] Did you review existing tests that can be extended to cover this scenario? Which test did you review?

### Tickets:
- *CVS-182396*

Signed-off-by: hyunback <hyunback.kim@intel.com>
1 parent 4188f16 commit c7c101f

File tree

5 files changed

+368
-0
lines changed

5 files changed

+368
-0
lines changed
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
// Copyright (C) 2018-2026 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
//
4+
5+
#include "include/batch_headers/fetch_data.cl"
6+
#include "include/batch_headers/sub_group_block_read.cl"
7+
#include "include/batch_headers/sub_group_block_write.cl"
8+
9+
// ======================================================================================
10+
// Optimized concatenation kernel for b_fs_yx_fsv32 format (feature-axis concat)
11+
// Supports INT8/UINT8/FP16 data types.
12+
//
13+
// Required JIT definitions:
14+
// --------------------------------------------------------------------------------------
15+
// SUB_GROUP_SIZE - [int] sub-group/simd size; limited to 16
16+
// FSV - [int] feature slice size; limited to 32
17+
// FSV_PER_THREAD - [int] number of features per thread = FSV / SUB_GROUP_SIZE
18+
// ALIGNED - [0/1] whether the output offset is aligned to FSV
19+
// ======================================================================================
20+
21+
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
22+
__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
23+
KERNEL(concatenation_gpu_b_fs_yx_fsv32)(
24+
__global INPUT0_TYPE* input,
25+
__global OUTPUT_TYPE* output,
26+
uint output_offset_in_concat_axis)
27+
{
28+
const uint x = (uint)get_global_id(0);
29+
const uint y = (uint)get_global_id(1);
30+
const uint fs_b_id = get_group_id(2);
31+
const uint sglid = get_sub_group_local_id();
32+
33+
const uint fs = fs_b_id / INPUT0_BATCH_NUM;
34+
const uint b = fs_b_id - fs * INPUT0_BATCH_NUM;
35+
36+
const uint input_offset = INPUT0_GET_INDEX(b, fs * FSV, y, x);
37+
38+
MAKE_VECTOR_TYPE(INPUT0_TYPE, 2) in = DT_INPUT_BLOCK_READ2(input, input_offset);
39+
40+
in = ACTIVATION(in, ACTIVATION_PARAMS);
41+
42+
#if ALIGNED
43+
const uint dst_index = OUTPUT_GET_INDEX(b, output_offset_in_concat_axis + fs * FSV, y, x);
44+
45+
// Full feature block: use block write for maximum throughput
46+
if (fs * FSV + FSV <= INPUT0_FEATURE_NUM) {
47+
DT_OUTPUT_BLOCK_WRITE2(output, dst_index, in);
48+
} else {
49+
// Last partial feature block: write only valid features
50+
if (sglid + fs * FSV < INPUT0_FEATURE_NUM) {
51+
output[dst_index + sglid] = in.s0;
52+
}
53+
if (sglid + SUB_GROUP_SIZE + fs * FSV < INPUT0_FEATURE_NUM) {
54+
output[dst_index + SUB_GROUP_SIZE + sglid] = in.s1;
55+
}
56+
}
57+
#else
58+
// Unaligned case: use per-element writes with proper index computation
59+
const uint dst_feature = fs * FSV + output_offset_in_concat_axis + sglid;
60+
61+
if (sglid + SUB_GROUP_SIZE + fs * FSV < INPUT0_FEATURE_NUM) {
62+
output[OUTPUT_GET_INDEX(b, dst_feature, y, x)] = in.s0;
63+
output[OUTPUT_GET_INDEX(b, dst_feature + SUB_GROUP_SIZE, y, x)] = in.s1;
64+
} else if (sglid + fs * FSV < INPUT0_FEATURE_NUM) {
65+
output[OUTPUT_GET_INDEX(b, dst_feature, y, x)] = in.s0;
66+
}
67+
#endif
68+
}
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
// Copyright (C) 2018-2026 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
//
4+
5+
#include <algorithm>
6+
#include "concatenation_kernel_b_fs_yx_fsv32.h"
7+
#include "kernel_selector_utils.h"
8+
9+
namespace kernel_selector {
10+
11+
static constexpr size_t subGroupSize = 16;
12+
static constexpr size_t fsv = 32;
13+
static constexpr size_t fsvPerThread = fsv / subGroupSize;
14+
15+
ParamsKey ConcatenationKernel_b_fs_yx_fsv32::GetSupportedKey() const {
16+
ParamsKey k;
17+
k.EnableInputDataType(Datatype::INT8);
18+
k.EnableOutputDataType(Datatype::INT8);
19+
k.EnableInputDataType(Datatype::UINT8);
20+
k.EnableOutputDataType(Datatype::UINT8);
21+
k.EnableInputDataType(Datatype::F16);
22+
k.EnableOutputDataType(Datatype::F16);
23+
k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
24+
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32);
25+
k.EnableTensorOffset();
26+
k.EnableTensorPitches();
27+
k.EnableBatching();
28+
k.EnableConcatAxis(ConcatAxis::FEATURE);
29+
k.EnableConcatKernelPerInput();
30+
return k;
31+
}
32+
33+
DeviceFeaturesKey ConcatenationKernel_b_fs_yx_fsv32::get_required_device_features_key(const Params& params) const {
34+
return get_common_subgroups_device_features_key(params);
35+
}
36+
37+
bool ConcatenationKernel_b_fs_yx_fsv32::Validate(const Params& p) const {
38+
if (!ConcatenationKernelBase::Validate(p)) {
39+
DO_NOT_USE_THIS_KERNEL(p.layerID);
40+
}
41+
42+
const concatenation_params& params = static_cast<const concatenation_params&>(p);
43+
44+
if (params.axis != ConcatAxis::FEATURE)
45+
DO_NOT_USE_THIS_KERNEL(p.layerID);
46+
47+
// All inputs must have the same layout
48+
auto same_layout = params.inputs[0].GetLayout();
49+
for (const auto& lt : params.inputs) {
50+
if (lt.GetLayout() != same_layout) {
51+
DO_NOT_USE_THIS_KERNEL(p.layerID);
52+
}
53+
}
54+
55+
return true;
56+
}
57+
58+
ConcatenationKernelBase::DispatchData ConcatenationKernel_b_fs_yx_fsv32::SetDefault(const concatenation_params& params) const {
59+
DispatchData dispatchData = ConcatenationKernelBase::SetDefault(params);
60+
const auto& input = params.inputs[0];
61+
62+
dispatchData.gws[0] = input.X().v;
63+
dispatchData.gws[1] = input.Y().v;
64+
dispatchData.gws[2] = CeilDiv(input.Feature().v, fsv) * subGroupSize * input.Batch().v;
65+
66+
dispatchData.lws[0] = 1;
67+
dispatchData.lws[1] = 1;
68+
dispatchData.lws[2] = subGroupSize;
69+
70+
return dispatchData;
71+
}
72+
73+
KernelsPriority ConcatenationKernel_b_fs_yx_fsv32::GetKernelsPriority(const Params& /*params*/) const {
74+
return FORCE_PRIORITY_1;
75+
}
76+
77+
JitConstants ConcatenationKernel_b_fs_yx_fsv32::GetJitConstants(const concatenation_params& params) const {
78+
JitConstants jit = MakeBaseParamsJitConstants(params);
79+
80+
jit.AddConstant(MakeJitConstant("ALIGNED", params.isAligned));
81+
jit.AddConstant(MakeJitConstant("FSV", fsv));
82+
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", subGroupSize));
83+
jit.AddConstant(MakeJitConstant("FSV_PER_THREAD", fsvPerThread));
84+
85+
return jit;
86+
}
87+
88+
size_t ConcatenationKernel_b_fs_yx_fsv32::GetAlignment(const concatenation_params& /*params*/) const {
89+
return fsv;
90+
}
91+
92+
KernelsData ConcatenationKernel_b_fs_yx_fsv32::GetKernelsData(const Params& params) const {
93+
if (!Validate(params)) {
94+
return {};
95+
}
96+
97+
const concatenation_params& orgParams = static_cast<const concatenation_params&>(params);
98+
99+
KernelData kd = KernelData::Default<concatenation_params>(params, orgParams.inputs.size());
100+
101+
uint32_t lastOffset = 0;
102+
size_t ifm_offset = 0;
103+
for (size_t i = 0; i < orgParams.inputs.size(); i++) {
104+
const auto& input = orgParams.inputs[i];
105+
106+
auto newParams = orgParams;
107+
newParams.inputs.resize(1);
108+
newParams.inputs[0] = input;
109+
size_t ifm = input.Feature().v;
110+
newParams.isAligned = ifm_offset % fsv == 0;
111+
ifm_offset += ifm;
112+
113+
auto& kernel = kd.kernels[i];
114+
DispatchData dispatchData = SetDefault(newParams);
115+
auto cldnnJit = GetJitConstants(newParams);
116+
auto entryPoint = GetEntryPoint(kernelName, newParams.layerID, params, i);
117+
auto jit = CreateJit(kernelName, cldnnJit, entryPoint);
118+
119+
kernel.code.kernelString = GetKernelString(kernelName, jit, entryPoint, params.engineInfo);
120+
kernel.params.workGroups.global = dispatchData.gws;
121+
kernel.params.workGroups.local = dispatchData.lws;
122+
kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, (uint32_t)i});
123+
kernel.params.arguments.push_back({ArgumentDescriptor::Types::OUTPUT, 0});
124+
kernel.skip_execution = KernelData::SkipKernelExecution(newParams);
125+
126+
ScalarDescriptor s;
127+
s.t = ScalarDescriptor::Types::UINT32;
128+
s.v.u32 = lastOffset;
129+
kernel.params.scalars.push_back(s);
130+
kernel.params.arguments.push_back({ArgumentDescriptor::Types::SCALAR, 0});
131+
132+
auto concatChannelIndex = DataTensor::Channelndex(orgParams.inputs[i].GetLayout(), GetConcatChannel(orgParams));
133+
OPENVINO_ASSERT(concatChannelIndex >= 0, "concatChannelIndex shouldn't be negative");
134+
lastOffset += (uint32_t)input.GetDims()[concatChannelIndex].v;
135+
}
136+
137+
return {kd};
138+
}
139+
140+
} // namespace kernel_selector
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
// Copyright (C) 2018-2026 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
//
4+
5+
#pragma once
6+
7+
#include "concatenation_kernel_base.h"
8+
9+
namespace kernel_selector {
10+
11+
class ConcatenationKernel_b_fs_yx_fsv32 : public ConcatenationKernelBase {
12+
public:
13+
ConcatenationKernel_b_fs_yx_fsv32() : ConcatenationKernelBase("concatenation_gpu_b_fs_yx_fsv32") {}
14+
virtual ~ConcatenationKernel_b_fs_yx_fsv32() {}
15+
16+
KernelsData GetKernelsData(const Params& params) const override;
17+
KernelsPriority GetKernelsPriority(const Params& params) const override;
18+
ParamsKey GetSupportedKey() const override;
19+
DeviceFeaturesKey get_required_device_features_key(const Params& params) const override;
20+
DispatchData SetDefault(const concatenation_params& params) const override;
21+
JitConstants GetJitConstants(const concatenation_params& params) const override;
22+
bool Validate(const Params& p) const override;
23+
size_t GetAlignment(const concatenation_params& params) const override;
24+
};
25+
26+
} // namespace kernel_selector

src/plugins/intel_gpu/src/kernel_selector/kernels/concatenation/concatenation_kernel_selector.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include "concatenation_kernel_simple_ref.h"
88
#include "concatenation_kernel_depth_bfyx_no_pitch.h"
99
#include "concatenation_kernel_b_fs_yx_fsv16.h"
10+
#include "concatenation_kernel_b_fs_yx_fsv32.h"
1011
#include "concatenation_kernel_fs_b_yx_fsv32.h"
1112

1213
namespace kernel_selector {
@@ -15,6 +16,7 @@ concatenation_kernel_selector::concatenation_kernel_selector() {
1516
Attach<ConcatenationKernel_simple_Ref>();
1617
Attach<ConcatenationKernel_depth_bfyx_no_pitch>();
1718
Attach<ConcatenationKernel_b_fs_yx_fsv16>();
19+
Attach<ConcatenationKernel_b_fs_yx_fsv32>();
1820
Attach<ConcatenationKernel_fs_b_yx_fsv32>();
1921
}
2022

src/plugins/intel_gpu/tests/unit/test_cases/concatenation_gpu_test.cpp

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2182,3 +2182,135 @@ INSTANTIATE_TEST_SUITE_P(smoke,
21822182
),
21832183
concat_gpu_implicit::PrintToStringParamName);
21842184
#endif
2185+
// ============================================================================
2186+
// Tests for concatenation_gpu_b_fs_yx_fsv32 kernel (int8/uint8, feature axis)
2187+
// Forces OCL impl with b_fs_yx_fsv32 format to ensure the optimized kernel
2188+
// is selected instead of falling back to ref kernel.
2189+
// ============================================================================
2190+
using TestParamType_concat_fsv32 = ::testing::tuple<
2191+
size_t, // 0 - Batch
2192+
std::vector<size_t>, // 1 - Input feature sizes
2193+
size_t, // 2 - Input Y
2194+
size_t, // 3 - Input X
2195+
data_types>; // 4 - Data type (i8/u8)
2196+
2197+
struct concat_gpu_b_fs_yx_fsv32_force : public ::testing::TestWithParam<TestParamType_concat_fsv32> {
2198+
tests::random_generator rg;
2199+
2200+
void SetUp() override {
2201+
rg.set_seed(GET_SUITE_NAME);
2202+
}
2203+
2204+
static std::string PrintToStringParamName(testing::TestParamInfo<TestParamType_concat_fsv32> param_info) {
2205+
auto batch = testing::get<0>(param_info.param);
2206+
auto& feats = testing::get<1>(param_info.param);
2207+
auto y = testing::get<2>(param_info.param);
2208+
auto x = testing::get<3>(param_info.param);
2209+
auto dt = testing::get<4>(param_info.param);
2210+
std::string feat_str;
2211+
for (size_t i = 0; i < feats.size(); i++)
2212+
feat_str += (i ? "_" : "") + std::to_string(feats[i]);
2213+
return "b" + std::to_string(batch) + "_f" + feat_str +
2214+
"_y" + std::to_string(y) + "_x" + std::to_string(x) +
2215+
"_" + ov::element::Type(dt).get_type_name();
2216+
}
2217+
2218+
void test() {
2219+
auto& engine = get_test_engine();
2220+
const auto batch_num = testing::get<0>(GetParam());
2221+
const auto& in_features = testing::get<1>(GetParam());
2222+
const auto input_y = testing::get<2>(GetParam());
2223+
const auto input_x = testing::get<3>(GetParam());
2224+
const auto dt = testing::get<4>(GetParam());
2225+
2226+
topology topology;
2227+
std::vector<memory::ptr> in_memory;
2228+
std::vector<input_info> input_ids;
2229+
2230+
for (size_t i = 0; i < in_features.size(); i++) {
2231+
auto sz = tensor(static_cast<int32_t>(batch_num), static_cast<int32_t>(in_features[i]),
2232+
static_cast<int32_t>(input_x), static_cast<int32_t>(input_y));
2233+
auto in_lay = layout(dt, format::b_fs_yx_fsv32, sz);
2234+
auto in_mem = engine.allocate_memory(in_lay);
2235+
auto count = static_cast<int>(batch_num * in_features[i] * input_y * input_x);
2236+
if (dt == data_types::i8)
2237+
set_values<int8_t>(in_mem, rg.generate_random_1d<int8_t>(count, -50, 50));
2238+
else
2239+
set_values<uint8_t>(in_mem, rg.generate_random_1d<uint8_t>(count, 0, 200));
2240+
in_memory.push_back(in_mem);
2241+
topology.add(input_layout("input" + std::to_string(i), in_lay));
2242+
input_ids.push_back(input_info("input" + std::to_string(i)));
2243+
}
2244+
2245+
topology.add(concatenation("concat", input_ids, 1, dt));
2246+
topology.add(reorder("output", input_info("concat"), format::bfyx, dt));
2247+
2248+
ExecutionConfig config = get_test_default_config(engine);
2249+
config.set_property(ov::intel_gpu::optimize_data(true));
2250+
ov::intel_gpu::ImplementationDesc impl = { format::b_fs_yx_fsv32, "concatenation_gpu_b_fs_yx_fsv32", impl_types::ocl };
2251+
config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ { "concat", impl } }));
2252+
2253+
network network(engine, topology, config);
2254+
for (size_t i = 0; i < in_features.size(); i++)
2255+
network.set_input_data(input_ids[i].pid, in_memory[i]);
2256+
2257+
auto outputs = network.execute();
2258+
auto out_mem = outputs.at("output").get_memory();
2259+
auto out_lay = out_mem->get_layout();
2260+
size_t total_f = 0;
2261+
for (auto f : in_features) total_f += f;
2262+
ASSERT_EQ(out_lay.feature(), static_cast<int>(total_f));
2263+
2264+
// Verify element-by-element against inputs read in logical order
2265+
size_t f_offset = 0;
2266+
for (size_t in_i = 0; in_i < in_features.size(); in_i++) {
2267+
for (size_t b = 0; b < batch_num; b++) {
2268+
for (size_t f = 0; f < in_features[in_i]; f++) {
2269+
for (size_t y = 0; y < input_y; y++) {
2270+
for (size_t x = 0; x < input_x; x++) {
2271+
auto in_coords = tensor(batch(b), feature(f), spatial(x, y, 0, 0));
2272+
auto out_coords = tensor(batch(b), feature(f_offset + f), spatial(x, y, 0, 0));
2273+
if (dt == data_types::i8) {
2274+
cldnn::mem_lock<int8_t> in_ptr(in_memory[in_i], get_test_stream());
2275+
cldnn::mem_lock<int8_t> out_ptr(out_mem, get_test_stream());
2276+
ASSERT_EQ(in_ptr[in_memory[in_i]->get_layout().get_linear_offset(in_coords)],
2277+
out_ptr[out_lay.get_linear_offset(out_coords)])
2278+
<< " b=" << b << " f=" << f_offset + f << " y=" << y << " x=" << x;
2279+
} else {
2280+
cldnn::mem_lock<uint8_t> in_ptr(in_memory[in_i], get_test_stream());
2281+
cldnn::mem_lock<uint8_t> out_ptr(out_mem, get_test_stream());
2282+
ASSERT_EQ(in_ptr[in_memory[in_i]->get_layout().get_linear_offset(in_coords)],
2283+
out_ptr[out_lay.get_linear_offset(out_coords)])
2284+
<< " b=" << b << " f=" << f_offset + f << " y=" << y << " x=" << x;
2285+
}
2286+
}
2287+
}
2288+
}
2289+
}
2290+
f_offset += in_features[in_i];
2291+
}
2292+
}
2293+
};
2294+
2295+
TEST_P(concat_gpu_b_fs_yx_fsv32_force, feature_axis) {
2296+
ASSERT_NO_FATAL_FAILURE(test());
2297+
}
2298+
2299+
INSTANTIATE_TEST_SUITE_P(smoke,
2300+
concat_gpu_b_fs_yx_fsv32_force,
2301+
::testing::Values(
2302+
// Aligned features (all 32-aligned)
2303+
TestParamType_concat_fsv32(1, { 32, 64, 32 }, 2, 2, data_types::i8),
2304+
TestParamType_concat_fsv32(1, { 32, 64, 32 }, 2, 2, data_types::u8),
2305+
// Unaligned features
2306+
TestParamType_concat_fsv32(1, { 24, 48, 17 }, 3, 3, data_types::i8),
2307+
TestParamType_concat_fsv32(1, { 24, 48, 17 }, 3, 3, data_types::u8),
2308+
// Mixed aligned/unaligned, batch > 1
2309+
TestParamType_concat_fsv32(2, { 64, 33, 15 }, 2, 2, data_types::i8),
2310+
TestParamType_concat_fsv32(2, { 64, 33, 15 }, 2, 2, data_types::u8),
2311+
// Single small unaligned input
2312+
TestParamType_concat_fsv32(1, { 3, 5 }, 4, 4, data_types::i8),
2313+
// Large aligned
2314+
TestParamType_concat_fsv32(1, { 64, 64, 64, 64 }, 1, 1, data_types::i8)
2315+
),
2316+
concat_gpu_b_fs_yx_fsv32_force::PrintToStringParamName);

0 commit comments

Comments
 (0)