Skip to content

Commit 2187cae

Browse files
committed
Fix smallest-index-first tiebreaking issue.
Deterministic smallest-index-first tiebreaking, matching TF/ONNX behavior. Previously the selected index was non-deterministic because of atomic_add racing in Phase 2b. Phase 2b now uses a hybrid approach: if all at-threshold elements fit in the remaining PADDED_K slots, it uses the fast parallel atomic_add gather (common case); otherwise it falls back to a single work-item sequential scan. Signed-off-by: hyunback <hyunback.kim@intel.com>
1 parent 95ecfd4 commit 2187cae

File tree

2 files changed

+110
-11
lines changed

2 files changed

+110
-11
lines changed

src/plugins/intel_gpu/src/kernel_selector/cl_kernels/arg_max_min_topk_radix.cl

Lines changed: 39 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -295,9 +295,12 @@ KERNEL(arg_max_min_topk_radix)(
295295
barrier(CLK_LOCAL_MEM_FENCE);
296296

297297
// ============================================================
298-
// Phase 2b: Gather ALL elements AT threshold into SLM (up to PADDED_K)
299-
// Gather more than needed; bitonic sort + combined key will deterministically
300-
// place the correct subset into the top-K positions.
298+
// Phase 2b: Gather elements AT threshold into SLM
299+
// Strategy: check if all at-threshold elements fit in remaining slots.
300+
// - Fast path (common): all fit. parallel atomic_add (no correctness issue)
301+
// - Slow path (rare): overflow. single WI sequential scan for deterministic
302+
// index ordering (smallest indices first, matching TF/ONNX behavior)
303+
// histogram[threshold & 0xFF] still holds the fine histogram count from Phase 1.
301304
// ============================================================
302305
uint current_count = min(gather_count, (uint)PADDED_K);
303306
if (lid == 0) {
@@ -306,18 +309,43 @@ KERNEL(arg_max_min_topk_radix)(
306309
barrier(CLK_LOCAL_MEM_FENCE);
307310

308311
if (current_count < PADDED_K) {
309-
for (uint i = lid; i < VALUES_NUM; i += WG_SIZE) {
310-
uint sortable = my_sortable[i];
311-
if (sortable == threshold) {
312-
uint pos = atomic_add(&gather_count, 1);
313-
if (pos < PADDED_K) {
312+
uint at_count = histogram[threshold & 0xFF];
313+
uint available_slots = PADDED_K - current_count;
314+
315+
if (at_count <= available_slots) {
316+
// Fast path: all at-threshold elements fit, parallel gather is safe
317+
for (uint i = lid; i < VALUES_NUM; i += WG_SIZE) {
318+
uint sortable = my_sortable[i];
319+
if (sortable == threshold) {
320+
uint pos = atomic_add(&gather_count, 1);
321+
if (pos < PADDED_K) {
322+
#ifdef MAX_OUT
323+
sort_keys[pos] = (sortable << 16) | (0xFFFF - (i & 0xFFFF));
324+
#else
325+
sort_keys[pos] = (sortable << 16) | (i & 0xFFFF);
326+
#endif
327+
sort_idxs[pos] = i;
328+
}
329+
}
330+
}
331+
} else {
332+
// Slow path: more at-threshold elements than slots,
333+
// sequential scan ensures smallest indices are selected first
334+
if (lid == 0) {
335+
uint pos = current_count;
336+
for (uint i = 0; i < VALUES_NUM && pos < PADDED_K; i++) {
337+
uint sortable = my_sortable[i];
338+
if (sortable == threshold) {
314339
#ifdef MAX_OUT
315-
sort_keys[pos] = (sortable << 16) | (0xFFFF - (i & 0xFFFF));
340+
sort_keys[pos] = (sortable << 16) | (0xFFFF - (i & 0xFFFF));
316341
#else
317-
sort_keys[pos] = (sortable << 16) | (i & 0xFFFF);
342+
sort_keys[pos] = (sortable << 16) | (i & 0xFFFF);
318343
#endif
319-
sort_idxs[pos] = i;
344+
sort_idxs[pos] = i;
345+
pos++;
346+
}
320347
}
348+
gather_count = pos;
321349
}
322350
}
323351
}

src/plugins/intel_gpu/tests/unit/test_cases/arg_max_gpu_test.cpp

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1343,3 +1343,74 @@ TEST(arg_max_gpu_topk_radix, f16_max_yolov26n_like) {
13431343
// Verify last top-K value is > base values (~0.99 max)
13441344
ASSERT_GT(static_cast<float>(output_ptr[top_k - 1]), 50.0f);
13451345
}
1346+
1347+
// Radix test: CI failure reproducer - many duplicate values, verify indices tiebreaking
1348+
// CI failed on shape [4,2,70] K=5 because at-threshold elements with same value
1349+
// had non-deterministic index selection in Phase 2b (atomic_add race).
1350+
// This test uses feature axis with N=70 duplicates to verify correct tiebreaking.
1351+
TEST(arg_max_gpu_topk_radix, f16_max_duplicates_n70_k5_indices_tiebreak) {
1352+
static const int32_t batch_num = 4, feature_num = 70, x_size = 1, y_size = 1;
1353+
auto& engine = get_test_engine();
1354+
const int top_k = 5;
1355+
auto input = engine.allocate_memory({data_types::f16, format::bfyx, {batch_num, feature_num, x_size, y_size}});
1356+
auto top_k_input = engine.allocate_memory({data_types::f16, format::bfyx, {1, 1, 1, 1}});
1357+
auto second_output = engine.allocate_memory({data_types::f32, format::bfyx, {batch_num, top_k, x_size, y_size}});
1358+
1359+
// Random ints 0-14, same distribution as CI test → many duplicates
1360+
std::vector<ov::float16> input_vec(batch_num * feature_num);
1361+
std::mt19937 rng(42);
1362+
std::uniform_int_distribution<int> dist(0, 14);
1363+
for (size_t i = 0; i < input_vec.size(); i++) {
1364+
input_vec[i] = ov::float16(static_cast<float>(dist(rng)));
1365+
}
1366+
set_values(input, input_vec);
1367+
1368+
topology topology;
1369+
topology.add(input_layout("input", input->get_layout()));
1370+
topology.add(cldnn::data("const", top_k_input));
1371+
topology.add(mutable_data("second_output", second_output));
1372+
topology.add(arg_max_min("arg_max",
1373+
{ input_info("input"), input_info("const"), input_info("second_output") },
1374+
ov::op::TopKMode::MAX,
1375+
top_k,
1376+
1, // axis=1 (feature)
1377+
ov::op::TopKSortType::SORT_VALUES,
1378+
true, // values_first
1379+
false,
1380+
data_types::f16));
1381+
1382+
network network(engine, topology, get_test_default_config(engine));
1383+
network.set_input_data("input", input);
1384+
auto outputs = network.execute();
1385+
1386+
auto output = outputs.at("arg_max").get_memory();
1387+
cldnn::mem_lock<ov::float16> val_ptr(output, get_test_stream());
1388+
cldnn::mem_lock<float> idx_ptr(second_output, get_test_stream());
1389+
1390+
// For each batch, verify values AND indices match CPU reference
1391+
for (int b = 0; b < batch_num; b++) {
1392+
int input_offset = b * feature_num;
1393+
int out_offset = b * top_k;
1394+
1395+
// CPU reference: sort by (value desc, index asc) via stable_sort
1396+
std::vector<std::pair<float, int>> ref;
1397+
for (int i = 0; i < feature_num; i++) {
1398+
ref.push_back({static_cast<float>(input_vec[input_offset + i]), i});
1399+
}
1400+
std::stable_sort(ref.begin(), ref.end(),
1401+
[](const auto& a, const auto& b) { return a.first > b.first; });
1402+
1403+
for (int k = 0; k < top_k; k++) {
1404+
float gpu_val = static_cast<float>(val_ptr[out_offset + k]);
1405+
float ref_val = ref[k].first;
1406+
ASSERT_NEAR(gpu_val, ref_val, 0.01f)
1407+
<< "value mismatch at b=" << b << " k=" << k;
1408+
1409+
int gpu_idx = static_cast<int>(idx_ptr[out_offset + k]);
1410+
int ref_idx = ref[k].second;
1411+
ASSERT_EQ(gpu_idx, ref_idx)
1412+
<< "index mismatch at b=" << b << " k=" << k
1413+
<< " (value=" << ref_val << ", expected idx=" << ref_idx << ", got idx=" << gpu_idx << ")";
1414+
}
1415+
}
1416+
}

0 commit comments

Comments
 (0)