Skip to content

Commit 3537719

Browse files
committed
[GPU] Fix kernel cache collision for reselected impl after propagate_constants
When propagate_constants reselects a node's impl from dynamic to static, build_implementations could assign the wrong kernel due to a kernel_impl_params key collision. Two nodes with identical params but different impls (one shape-agnostic, one static) would share the same cache key, causing the first-registered dynamic kernel to overwrite the reselected static kernel. Fix: compile kernels immediately in try_reselect_impl_for_node after impl reselection, and skip pre-compiled nodes in build_implementations to preserve their correct kernels. Added a unit test to verify that the reselected node retains its static kernel after build_implementations runs.
1 parent b8ca92f commit 3537719

File tree

3 files changed

+137
-0
lines changed

3 files changed

+137
-0
lines changed

src/plugins/intel_gpu/src/graph/graph_optimizer/build_implementations.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77

88
#include "intel_gpu/runtime/itt.hpp"
99

10+
#include <unordered_set>
11+
1012
using namespace cldnn;
1113

1214
void build_implementations::run(program& p) {
@@ -16,14 +18,30 @@ void build_implementations::run(program& p) {
1618
}
1719

1820
auto& cache = p.get_kernels_cache();
21+
22+
// Nodes whose kernels were already compiled (e.g., during impl reselection
23+
// in propagate_constants) have their kernel sources reset via
24+
// reset_kernels_source(). For such nodes, OCL v2 implementations assert
25+
// in get_kernels_source() if sources contain null entries. Detect
26+
// pre-compiled nodes via non-empty get_kernels() and skip both
27+
// add_kernels_source() and init_kernels() for them to preserve their
28+
// pre-compiled kernels without triggering the assert.
29+
std::unordered_set<program_node*> pre_compiled_nodes;
30+
1931
for (auto& n : p.get_processing_order()) {
2032
if (auto impl = n->get_selected_impl()) {
33+
if (!impl->get_kernels().empty()) {
34+
pre_compiled_nodes.insert(n);
35+
continue;
36+
}
2137
auto params = n->get_kernel_impl_params();
2238
cache.add_kernels_source(*params, impl->get_kernels_source());
2339
}
2440
}
2541
cache.build_all();
2642
for (auto& n : p.get_processing_order()) {
43+
if (pre_compiled_nodes.count(n))
44+
continue;
2745
if (auto impl = n->get_selected_impl()) {
2846
auto params = n->get_kernel_impl_params();
2947
impl->init_kernels(cache, *params);

src/plugins/intel_gpu/src/graph/graph_optimizer/propagate_constants.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,18 @@ void try_reselect_impl_for_node(program_node* node) {
6767
try {
6868
if (selected_impl_manager) {
6969
node->set_selected_impl(selected_impl_manager->create(*node, *params));
70+
// Compile kernels immediately for the reselected impl.
71+
// This avoids kernel_impl_params key collisions in build_implementations
72+
// where another node with the same params but a different (dynamic) impl
73+
// would shadow this node's kernel source in the kernels cache.
74+
auto impl = node->get_selected_impl();
75+
auto kernel_sources = impl->get_kernels_source();
76+
if (!kernel_sources.empty()) {
77+
auto& kernels_cache = node->get_program().get_kernels_cache();
78+
auto kernels = kernels_cache.compile(*params, kernel_sources);
79+
impl->set_kernels(kernels);
80+
impl->reset_kernels_source();
81+
}
7082
} else {
7183
fail_reason = "choose_impl returned nullptr (no matching implementation found)";
7284
}

src/plugins/intel_gpu/tests/unit/passes/propagate_constants_test.cpp

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,3 +192,110 @@ TEST(propagate_constants, no_reselection_when_constants_are_static) {
192192
ASSERT_FALSE(impl_after->is_dynamic());
193193
ASSERT_EQ(impl_before, impl_after);
194194
}
195+
196+
// Verifies that build_implementations does not overwrite a reselected node's
197+
// pre-compiled kernel with a wrong (dynamic/shape-agnostic) kernel from another
198+
// node that shares the same kernel_impl_params key.
199+
//
200+
// This is the regression test for the kernel cache collision bug:
201+
// When propagate_constants reselects an impl from dynamic to static, the node
202+
// gets a static kernel compiled immediately. Meanwhile, another node with the
203+
// same kernel_impl_params (same layout/primitive config) but a dynamic impl
204+
// has a shape-agnostic (__sa) kernel. Without the fix, build_implementations'
205+
// add_kernels_source would register the dynamic node's __sa kernel first under
206+
// the shared params key, and init_kernels would then assign that __sa kernel to
207+
// the reselected static node — causing CL_INVALID_KERNEL_ARGS at runtime.
208+
//
209+
// Topology:
210+
// input_layout("input_static") -----------> eltwise("eltwise_reselected", sum) <-- transitions dyn→static
211+
// input_layout("input_dynamic") -----------> eltwise("eltwise_dynamic", sum) <-- stays dynamic
212+
// data("weights_a") ---+
213+
// eltwise("w_sum") --> (shared constant input for both)
214+
// data("weights_b") ---+
215+
//
216+
// Both eltwises have the same layout ({1,3,4,4} f32 bfyx) and same eltwise_mode,
217+
// so their kernel_impl_params are equal — triggering the cache key collision.
218+
// After propagate_constants + build_implementations, the reselected node must
219+
// have a non-__sa kernel and the dynamic node must have a __sa kernel.
220+
TEST(propagate_constants, build_implementations_preserves_reselected_kernel) {
221+
auto& engine = get_test_engine();
222+
223+
auto static_input_layout = layout{{1, 3, 4, 4}, data_types::f32, format::bfyx};
224+
auto dynamic_input_layout = layout{ov::PartialShape::dynamic(4), data_types::f32, format::bfyx};
225+
226+
topology topology(
227+
input_layout("input_static", static_input_layout),
228+
input_layout("input_dynamic", dynamic_input_layout),
229+
data("weights_a", engine.allocate_memory(layout{{1, 3, 4, 4}, data_types::f32, format::bfyx})),
230+
data("weights_b", engine.allocate_memory(layout{{1, 3, 4, 4}, data_types::f32, format::bfyx})),
231+
eltwise("w_sum", input_info("weights_a"), input_info("weights_b"), eltwise_mode::sum),
232+
eltwise("eltwise_reselected", input_info("input_static"), input_info("w_sum"), eltwise_mode::sum),
233+
eltwise("eltwise_dynamic", input_info("input_dynamic"), input_info("w_sum"), eltwise_mode::sum)
234+
);
235+
236+
ExecutionConfig config = get_test_default_config(engine);
237+
config.set_property(ov::intel_gpu::optimize_data(true));
238+
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
239+
240+
auto prog = program::build_program(engine, topology, config, false, true);
241+
242+
// Simulate unresolved dynamic shape on the constant computation node.
243+
auto& w_sum_node = prog->get_node("w_sum");
244+
auto dyn_layout = layout{ov::PartialShape::dynamic(4), data_types::f32, format::bfyx};
245+
w_sum_node.set_output_layout(dyn_layout, true);
246+
247+
program_wrapper::apply_opt_pass<compile_graph>(*prog);
248+
249+
// After compile_graph both eltwises should be dynamic (one input is dynamic).
250+
auto& reselected_node = prog->get_node("eltwise_reselected");
251+
auto& dynamic_node = prog->get_node("eltwise_dynamic");
252+
ASSERT_TRUE(reselected_node.get_selected_impl() == nullptr ||
253+
reselected_node.get_selected_impl()->is_dynamic());
254+
ASSERT_TRUE(dynamic_node.get_selected_impl() == nullptr ||
255+
dynamic_node.get_selected_impl()->is_dynamic());
256+
257+
// propagate_constants reselects eltwise_reselected to static impl and
258+
// immediately compiles its kernel.
259+
program_wrapper::apply_opt_pass<propagate_constants>(*prog);
260+
261+
auto reselected_impl = reselected_node.get_selected_impl();
262+
ASSERT_NE(reselected_impl, nullptr);
263+
ASSERT_FALSE(reselected_impl->is_dynamic());
264+
265+
// The reselected impl should already have compiled kernels (non-empty)
266+
// (its kernel sources were also reset at compile time, indicating
// pre-compilation, though that reset is not directly asserted here).
267+
auto reselected_kernels_before = reselected_impl->get_kernels();
268+
ASSERT_FALSE(reselected_kernels_before.empty());
269+
270+
// Now run build_implementations — this is where the collision would occur
271+
// without the fix.
272+
program_wrapper::apply_opt_pass<build_implementations>(*prog);
273+
274+
// After build_implementations, verify the reselected node still has
275+
// a valid kernel that is NOT a shape-agnostic (__sa) kernel.
276+
auto reselected_kernels_after = reselected_impl->get_kernels();
277+
ASSERT_FALSE(reselected_kernels_after.empty());
278+
for (const auto& kernel : reselected_kernels_after) {
279+
auto kernel_id = kernel->get_id();
280+
// A static impl's kernel must NOT have the __sa suffix
281+
ASSERT_EQ(kernel_id.find("__sa"), std::string::npos)
282+
<< "Reselected static node got shape-agnostic kernel: " << kernel_id;
283+
}
284+
285+
// The dynamic node should have a shape-agnostic kernel (if it has an impl).
286+
auto dynamic_impl = dynamic_node.get_selected_impl();
287+
if (dynamic_impl && dynamic_impl->is_dynamic()) {
288+
auto dynamic_kernels = dynamic_impl->get_kernels();
289+
if (!dynamic_kernels.empty()) {
290+
bool has_sa_kernel = false;
291+
for (const auto& kernel : dynamic_kernels) {
292+
if (kernel->get_id().find("__sa") != std::string::npos) {
293+
has_sa_kernel = true;
294+
break;
295+
}
296+
}
297+
ASSERT_TRUE(has_sa_kernel)
298+
<< "Dynamic node should have a shape-agnostic (__sa) kernel";
299+
}
300+
}
301+
}

0 commit comments

Comments
 (0)