Skip to content

Commit 3537719

Browse files
committed
[GPU] Fix kernel cache collision for reselected impl after propagate_constants
When propagate_constants reselects a node's impl from dynamic to static, build_implementations could assign the wrong kernel due to a kernel_impl_params key collision. Two nodes with identical params but different impls (one shape-agnostic, one static) would share the same cache key, causing the first-registered dynamic kernel to overwrite the reselected static kernel. Fix: compile kernels immediately in try_reselect_impl_for_node after impl reselection, and skip pre-compiled nodes in build_implementations to preserve their correct kernels. Added a unit test to verify that the reselected node retains its static kernel after build_implementations runs.
1 parent b8ca92f commit 3537719

File tree

3 files changed

+137
-0
lines changed

3 files changed

+137
-0
lines changed

src/plugins/intel_gpu/src/graph/graph_optimizer/build_implementations.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77

88
#include "intel_gpu/runtime/itt.hpp"
99

10+
#include <unordered_set>
11+
1012
using namespace cldnn;
1113

1214
void build_implementations::run(program& p) {
@@ -16,14 +18,30 @@ void build_implementations::run(program& p) {
1618
}
1719

1820
auto& cache = p.get_kernels_cache();
21+
22+
// Nodes whose kernels were already compiled (e.g., during impl reselection
23+
// in propagate_constants) have their kernel sources reset via
24+
// reset_kernels_source(). For such nodes, OCL v2 implementations assert
25+
// in get_kernels_source() if sources contain null entries. Detect
26+
// pre-compiled nodes via non-empty get_kernels() and skip both
27+
// add_kernels_source() and init_kernels() for them to preserve their
28+
// pre-compiled kernels without triggering the assert.
29+
std::unordered_set<program_node*> pre_compiled_nodes;
30+
1931
for (auto& n : p.get_processing_order()) {
2032
if (auto impl = n->get_selected_impl()) {
33+
if (!impl->get_kernels().empty()) {
34+
pre_compiled_nodes.insert(n);
35+
continue;
36+
}
2137
auto params = n->get_kernel_impl_params();
2238
cache.add_kernels_source(*params, impl->get_kernels_source());
2339
}
2440
}
2541
cache.build_all();
2642
for (auto& n : p.get_processing_order()) {
43+
if (pre_compiled_nodes.count(n))
44+
continue;
2745
if (auto impl = n->get_selected_impl()) {
2846
auto params = n->get_kernel_impl_params();
2947
impl->init_kernels(cache, *params);

src/plugins/intel_gpu/src/graph/graph_optimizer/propagate_constants.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,18 @@ void try_reselect_impl_for_node(program_node* node) {
6767
try {
6868
if (selected_impl_manager) {
6969
node->set_selected_impl(selected_impl_manager->create(*node, *params));
70+
// Compile kernels immediately for the reselected impl.
71+
// This avoids kernel_impl_params key collisions in build_implementations
72+
// where another node with the same params but a different (dynamic) impl
73+
// would shadow this node's kernel source in the kernels cache.
74+
auto impl = node->get_selected_impl();
75+
auto kernel_sources = impl->get_kernels_source();
76+
if (!kernel_sources.empty()) {
77+
auto& kernels_cache = node->get_program().get_kernels_cache();
78+
auto kernels = kernels_cache.compile(*params, kernel_sources);
79+
impl->set_kernels(kernels);
80+
impl->reset_kernels_source();
81+
}
7082
} else {
7183
fail_reason = "choose_impl returned nullptr (no matching implementation found)";
7284
}

src/plugins/intel_gpu/tests/unit/passes/propagate_constants_test.cpp

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,3 +192,110 @@ TEST(propagate_constants, no_reselection_when_constants_are_static) {
192192
ASSERT_FALSE(impl_after->is_dynamic());
193193
ASSERT_EQ(impl_before, impl_after);
194194
}
195+
196+
// Verifies that build_implementations does not overwrite a reselected node's
197+
// pre-compiled kernel with a wrong (dynamic/shape-agnostic) kernel from another
198+
// node that shares the same kernel_impl_params key.
199+
//
200+
// This is the regression test for the kernel cache collision bug:
201+
// When propagate_constants reselects an impl from dynamic to static, the node
202+
// gets a static kernel compiled immediately. Meanwhile, another node with the
203+
// same kernel_impl_params (same layout/primitive config) but a dynamic impl
204+
// has a shape-agnostic (__sa) kernel. Without the fix, build_implementations'
205+
// add_kernels_source would register the dynamic node's __sa kernel first under
206+
// the shared params key, and init_kernels would then assign that __sa kernel to
207+
// the reselected static node — causing CL_INVALID_KERNEL_ARGS at runtime.
208+
//
209+
// Topology:
210+
// input_layout("input_static") -----------> eltwise("eltwise_reselected", sum) <-- transitions dyn→static
211+
// input_layout("input_dynamic") -----------> eltwise("eltwise_dynamic", sum) <-- stays dynamic
212+
// data("weights_a") ---+
213+
// eltwise("w_sum") --> (shared constant input for both)
214+
// data("weights_b") ---+
215+
//
216+
// Both eltwises have the same layout ({1,3,4,4} f32 bfyx) and same eltwise_mode,
217+
// so their kernel_impl_params are equal — triggering the cache key collision.
218+
// After propagate_constants + build_implementations, the reselected node must
219+
// have a non-__sa kernel and the dynamic node must have a __sa kernel.
220+
TEST(propagate_constants, build_implementations_preserves_reselected_kernel) {
221+
auto& engine = get_test_engine();
222+
223+
auto static_input_layout = layout{{1, 3, 4, 4}, data_types::f32, format::bfyx};
224+
auto dynamic_input_layout = layout{ov::PartialShape::dynamic(4), data_types::f32, format::bfyx};
225+
226+
topology topology(
227+
input_layout("input_static", static_input_layout),
228+
input_layout("input_dynamic", dynamic_input_layout),
229+
data("weights_a", engine.allocate_memory(layout{{1, 3, 4, 4}, data_types::f32, format::bfyx})),
230+
data("weights_b", engine.allocate_memory(layout{{1, 3, 4, 4}, data_types::f32, format::bfyx})),
231+
eltwise("w_sum", input_info("weights_a"), input_info("weights_b"), eltwise_mode::sum),
232+
eltwise("eltwise_reselected", input_info("input_static"), input_info("w_sum"), eltwise_mode::sum),
233+
eltwise("eltwise_dynamic", input_info("input_dynamic"), input_info("w_sum"), eltwise_mode::sum)
234+
);
235+
236+
ExecutionConfig config = get_test_default_config(engine);
237+
config.set_property(ov::intel_gpu::optimize_data(true));
238+
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
239+
240+
auto prog = program::build_program(engine, topology, config, false, true);
241+
242+
// Simulate unresolved dynamic shape on the constant computation node.
243+
auto& w_sum_node = prog->get_node("w_sum");
244+
auto dyn_layout = layout{ov::PartialShape::dynamic(4), data_types::f32, format::bfyx};
245+
w_sum_node.set_output_layout(dyn_layout, true);
246+
247+
program_wrapper::apply_opt_pass<compile_graph>(*prog);
248+
249+
// After compile_graph both eltwises should be dynamic (one input is dynamic).
250+
auto& reselected_node = prog->get_node("eltwise_reselected");
251+
auto& dynamic_node = prog->get_node("eltwise_dynamic");
252+
ASSERT_TRUE(reselected_node.get_selected_impl() == nullptr ||
253+
reselected_node.get_selected_impl()->is_dynamic());
254+
ASSERT_TRUE(dynamic_node.get_selected_impl() == nullptr ||
255+
dynamic_node.get_selected_impl()->is_dynamic());
256+
257+
// propagate_constants reselects eltwise_reselected to static impl and
258+
// immediately compiles its kernel.
259+
program_wrapper::apply_opt_pass<propagate_constants>(*prog);
260+
261+
auto reselected_impl = reselected_node.get_selected_impl();
262+
ASSERT_NE(reselected_impl, nullptr);
263+
ASSERT_FALSE(reselected_impl->is_dynamic());
264+
265+
// The reselected impl should already have compiled kernels (non-empty)
266+
// (its kernel sources were also reset at compile time, indicating
// pre-compilation, though that reset is not directly asserted here).
267+
auto reselected_kernels_before = reselected_impl->get_kernels();
268+
ASSERT_FALSE(reselected_kernels_before.empty());
269+
270+
// Now run build_implementations — this is where the collision would occur
271+
// without the fix.
272+
program_wrapper::apply_opt_pass<build_implementations>(*prog);
273+
274+
// After build_implementations, verify the reselected node still has
275+
// a valid kernel that is NOT a shape-agnostic (__sa) kernel.
276+
auto reselected_kernels_after = reselected_impl->get_kernels();
277+
ASSERT_FALSE(reselected_kernels_after.empty());
278+
for (const auto& kernel : reselected_kernels_after) {
279+
auto kernel_id = kernel->get_id();
280+
// A static impl's kernel must NOT have the __sa suffix
281+
ASSERT_EQ(kernel_id.find("__sa"), std::string::npos)
282+
<< "Reselected static node got shape-agnostic kernel: " << kernel_id;
283+
}
284+
285+
// The dynamic node should have a shape-agnostic kernel (if it has an impl).
286+
auto dynamic_impl = dynamic_node.get_selected_impl();
287+
if (dynamic_impl && dynamic_impl->is_dynamic()) {
288+
auto dynamic_kernels = dynamic_impl->get_kernels();
289+
if (!dynamic_kernels.empty()) {
290+
bool has_sa_kernel = false;
291+
for (const auto& kernel : dynamic_kernels) {
292+
if (kernel->get_id().find("__sa") != std::string::npos) {
293+
has_sa_kernel = true;
294+
break;
295+
}
296+
}
297+
ASSERT_TRUE(has_sa_kernel)
298+
<< "Dynamic node should have a shape-agnostic (__sa) kernel";
299+
}
300+
}
301+
}

0 commit comments

Comments
 (0)