intel · AlexeySachkov · Sep 8, 2025 · Aug 27, 2025
@@ -184,11 +184,40 @@ lowerDynamicLocalMemCallDirect(CallInst *CI, Triple TT,
 
 static void lowerLocalMemCall(Function *LocalMemAllocFunc,
                               std::function<void(CallInst *CI)> TransformCall) {
+  static SmallPtrSet<Function *, 16> FuncsCache;
   SmallVector<CallInst *, 4> DelCalls;
   for (User *U : LocalMemAllocFunc->users()) {
     auto *CI = cast<CallInst>(U);
     TransformCall(CI);
     DelCalls.push_back(CI);
+    // Now, take each kernel that calls the builtins that allocate local memory,
+    // either directly or through a series of function calls that eventually end
+    // up in a direct call to the builtin, and attach the
+    // work-group-memory-static attribute to the kernel if not already attached.
+    // This is needed because free function kernels do not have the attribute
+    // added by the library as is the case with other types of kernels.
+    if (!FuncsCache.insert(CI->getFunction()).second)
+      continue; // We have already traversed call graph from this function.
+
+    SmallVector<Function *, 8> WorkList;
+    WorkList.push_back(CI->getFunction());
+    while (!WorkList.empty()) {
+      Function *F = WorkList.back();
+      WorkList.pop_back();
+
+      // Mark kernel as using scratch memory if it isn't marked already.
+      if (F->getCallingConv() == CallingConv::SPIR_KERNEL &&
+          !F->hasFnAttribute(WORK_GROUP_STATIC_ATTR))
+        F->addFnAttr(WORK_GROUP_STATIC_ATTR);
+
+      for (auto *FU : F->users()) {
+        if (auto *UCI = dyn_cast<CallInst>(FU)) {
+          if (FuncsCache.insert(UCI->getFunction()).second)
+            WorkList.push_back(UCI->getFunction());
+        } // Even though there could be other uses of a Function, we don't
+          // care about them because we are only concerned about call graph.
+      }
+    }
   }
 
   for (auto *CI : DelCalls) {

@@ -22,9 +22,29 @@ entry:
   ret void
 }
 
+; Function Attrs: convergent norecurse
+; CHECK: @__sycl_kernel_B{{.*}} #[[ATTRS:[0-9]+]]
+define weak_odr dso_local spir_kernel void @__sycl_kernel_B(ptr addrspace(1) %0) local_unnamed_addr #1 !kernel_arg_addr_space !5 {
+entry:
+  %1 = tail call spir_func ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder(i64 128) #1
+  ret void
+}
+
+; Function Attrs: convergent norecurse
+; CHECK: @__sycl_kernel_C{{.*}} #[[ATTRS]]
+define weak_odr dso_local spir_kernel void @__sycl_kernel_C(ptr addrspace(1) %0) local_unnamed_addr #1 !kernel_arg_addr_space !5 {
+entry:
+  %1 = tail call spir_func ptr addrspace(3) @__sycl_allocateLocalMemory(i64 128, i64 4) #1
+  ret void
+}
+
+; Function Attrs: convergent
+declare dso_local spir_func ptr addrspace(3) @__sycl_allocateLocalMemory(i64, i64) local_unnamed_addr #1
+
 ; Function Attrs: convergent
 declare dso_local spir_func ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder(i64) local_unnamed_addr #1
 
+; CHECK: #[[ATTRS]] = {{.*}} "sycl-work-group-static"
 attributes #0 = { convergent norecurse "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" "sycl-work-group-static"="1" }
 attributes #1 = { convergent norecurse }
 

@@ -0,0 +1,114 @@
+// REQUIRES: aspect-usm_shared_allocations
+// UNSUPPORTED: target-amd
+// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/16072
+
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// This test verifies that we can compile, run and get correct results when
+// using a free function kernel that allocates shared local memory in a kernel
+// either by way of the work group scratch memory extension or the work group
+// static memory extension.
+
+#include "helpers.hpp"
+
+#include <cassert>
+#include <sycl/ext/oneapi/experimental/enqueue_functions.hpp>
+#include <sycl/ext/oneapi/free_function_queries.hpp>
+#include <sycl/ext/oneapi/work_group_static.hpp>
+#include <sycl/group_barrier.hpp>
+#include <sycl/usm.hpp>
+
+namespace syclext = sycl::ext::oneapi;
+namespace syclexp = sycl::ext::oneapi::experimental;
+
+constexpr int SIZE = 16;
+
+SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::nd_range_kernel<1>))
+void scratchKernel(float *Src, float *Dst) {
+  size_t Lid = syclext::this_work_item::get_nd_item<1>().get_local_linear_id();
+  float *LocalMem =
+      reinterpret_cast<float *>(syclexp::get_work_group_scratch_memory());
+  LocalMem[Lid] = 2 * Src[Lid];
+  Dst[Lid] = LocalMem[Lid];
+}
+
+SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::nd_range_kernel<1>))
+void staticKernel(float *Src, float *Dst) {
+  sycl::nd_item<1> Item = syclext::this_work_item::get_nd_item<1>();
+  size_t Lid = Item.get_local_linear_id();
+  syclexp::work_group_static<float[SIZE]> LocalMem;
+  LocalMem[Lid] = Src[Lid] * Src[Lid];
+  sycl::group_barrier(Item.get_group());
+  if (Item.get_group().leader()) { // Check that memory is indeed shared between
+                                   // the work group.
+    for (int I = 0; I < SIZE; ++I)
+      assert(LocalMem[I] == Src[I] * Src[I]);
+  }
+  Dst[Lid] = LocalMem[Lid];
+}
+
+SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::nd_range_kernel<1>))
+void scratchStaticKernel(float *Src, float *Dst) {
+  size_t Lid = syclext::this_work_item::get_nd_item<1>().get_local_linear_id();
+  float *ScratchMem =
+      reinterpret_cast<float *>(syclexp::get_work_group_scratch_memory());
+  syclexp::work_group_static<float[SIZE]> StaticMem;
+  ScratchMem[Lid] = Src[Lid];
+  StaticMem[Lid] = Src[Lid];
+  Dst[Lid] = ScratchMem[Lid] + StaticMem[Lid];
+}
+
+int main() {
+  sycl::queue Q;
+  float *Src = sycl::malloc_shared<float>(SIZE, Q);
+  float *Dst = sycl::malloc_shared<float>(SIZE, Q);
+
+  for (int I = 0; I < SIZE; I++) {
+    Src[I] = I;
+  }
+
+  auto ScratchBndl =
+      syclexp::get_kernel_bundle<scratchKernel, sycl::bundle_state::executable>(
+          Q.get_context());
+  auto StaticBndl =
+      syclexp::get_kernel_bundle<staticKernel, sycl::bundle_state::executable>(
+          Q.get_context());
+  auto ScratchStaticBndl = syclexp::get_kernel_bundle<
+      scratchStaticKernel, sycl::bundle_state::executable>(Q.get_context());
+
+  sycl::kernel ScratchKrn =
+      ScratchBndl.template ext_oneapi_get_kernel<scratchKernel>();
+  sycl::kernel StaticKrn =
+      StaticBndl.template ext_oneapi_get_kernel<staticKernel>();
+  sycl::kernel ScratchStaticKrn =
+      ScratchStaticBndl.template ext_oneapi_get_kernel<scratchStaticKernel>();
+  syclexp::launch_config ScratchKernelcfg{
+      ::sycl::nd_range<1>(::sycl::range<1>(SIZE), ::sycl::range<1>(SIZE)),
+      syclexp::properties{
+          syclexp::work_group_scratch_size(SIZE * sizeof(float))}};
+  syclexp::launch_config StaticKernelcfg{
+      ::sycl::nd_range<1>(::sycl::range<1>(SIZE), ::sycl::range<1>(SIZE))};
+
+  syclexp::nd_launch(Q, ScratchKernelcfg, ScratchKrn, Src, Dst);
+  Q.wait();
+  for (int I = 0; I < SIZE; I++) {
+    assert(Dst[I] == 2 * Src[I]);
+  }
+
+  syclexp::nd_launch(Q, StaticKernelcfg, StaticKrn, Src, Dst);
+  Q.wait();
+  for (int I = 0; I < SIZE; I++) {
+    assert(Dst[I] == Src[I] * Src[I]);
+  }
+
+  syclexp::nd_launch(Q, ScratchKernelcfg, ScratchStaticKrn, Src, Dst);
+  Q.wait();
+  for (int I = 0; I < SIZE; I++) {
+    assert(Dst[I] == 2 * Src[I]);
+  }
+
+  sycl::free(Src, Q);
+  sycl::free(Dst, Q);
+  return 0;
+}