Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -184,11 +184,40 @@ lowerDynamicLocalMemCallDirect(CallInst *CI, Triple TT,

static void lowerLocalMemCall(Function *LocalMemAllocFunc,
std::function<void(CallInst *CI)> TransformCall) {
static SmallPtrSet<Function *, 16> FuncsCache;
SmallVector<CallInst *, 4> DelCalls;
for (User *U : LocalMemAllocFunc->users()) {
auto *CI = cast<CallInst>(U);
TransformCall(CI);
DelCalls.push_back(CI);
// Now, take each kernel that calls the builtins that allocate local memory,
// either directly or through a series of function calls that eventually end
// up in a direct call to the builtin, and attach the
// work-group-memory-static attribute to the kernel if not already attached.
// This is needed because free function kernels do not have the attribute
// added by the library as is the case with other types of kernels.
if (!FuncsCache.insert(CI->getFunction()).second)
continue; // We have already traversed call graph from this function.

SmallVector<Function *, 8> WorkList;
WorkList.push_back(CI->getFunction());
while (!WorkList.empty()) {
Function *F = WorkList.back();
WorkList.pop_back();

// Mark kernel as using scratch memory if it isn't marked already.
if (F->getCallingConv() == CallingConv::SPIR_KERNEL &&
!F->hasFnAttribute(WORK_GROUP_STATIC_ATTR))
F->addFnAttr(WORK_GROUP_STATIC_ATTR);

for (auto *FU : F->users()) {
if (auto *UCI = dyn_cast<CallInst>(FU)) {
if (FuncsCache.insert(UCI->getFunction()).second)
WorkList.push_back(UCI->getFunction());
} // Even though there could be other uses of a Function, we don't
// care about them because we are only concerned about call graph.
}
}
}

for (auto *CI : DelCalls) {
Expand Down
20 changes: 20 additions & 0 deletions llvm/test/SYCLLowerIR/work_group_static.ll
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,29 @@ entry:
ret void
}

; Function Attrs: convergent norecurse
; CHECK: @__sycl_kernel_B{{.*}} #[[ATTRS:[0-9]+]]
define weak_odr dso_local spir_kernel void @__sycl_kernel_B(ptr addrspace(1) %0) local_unnamed_addr #1 !kernel_arg_addr_space !5 {
entry:
%1 = tail call spir_func ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder(i64 128) #1
ret void
}

; Function Attrs: convergent norecurse
; CHECK: @__sycl_kernel_C{{.*}} #[[ATTRS]]
define weak_odr dso_local spir_kernel void @__sycl_kernel_C(ptr addrspace(1) %0) local_unnamed_addr #1 !kernel_arg_addr_space !5 {
entry:
%1 = tail call spir_func ptr addrspace(3) @__sycl_allocateLocalMemory(i64 128, i64 4) #1
ret void
}

; Function Attrs: convergent
declare dso_local spir_func ptr addrspace(3) @__sycl_allocateLocalMemory(i64, i64) local_unnamed_addr #1

; Function Attrs: convergent
declare dso_local spir_func ptr addrspace(3) @__sycl_dynamicLocalMemoryPlaceholder(i64) local_unnamed_addr #1

; CHECK: #[[ATTRS]] = {{.*}} "sycl-work-group-static"
attributes #0 = { convergent norecurse "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" "sycl-work-group-static"="1" }
attributes #1 = { convergent norecurse }

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
// REQUIRES: aspect-usm_shared_allocations
// UNSUPPORTED: target-amd
// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/16072

// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

// This test verifies that we can compile, run and get correct results when
// using a free function kernel that allocates shared local memory in a kernel
// either by way of the work group scratch memory extension or the work group
// static memory extension.

#include "helpers.hpp"

#include <cassert>
#include <sycl/ext/oneapi/experimental/enqueue_functions.hpp>
#include <sycl/ext/oneapi/free_function_queries.hpp>
#include <sycl/ext/oneapi/work_group_static.hpp>
#include <sycl/group_barrier.hpp>
#include <sycl/usm.hpp>

namespace syclext = sycl::ext::oneapi;
namespace syclexp = sycl::ext::oneapi::experimental;

constexpr int SIZE = 16;

SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::nd_range_kernel<1>))
void scratchKernel(float *Src, float *Dst) {
size_t Lid = syclext::this_work_item::get_nd_item<1>().get_local_linear_id();
float *LocalMem =
reinterpret_cast<float *>(syclexp::get_work_group_scratch_memory());
LocalMem[Lid] = 2 * Src[Lid];
Dst[Lid] = LocalMem[Lid];
}

SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::nd_range_kernel<1>))
void staticKernel(float *Src, float *Dst) {
sycl::nd_item<1> Item = syclext::this_work_item::get_nd_item<1>();
size_t Lid = Item.get_local_linear_id();
syclexp::work_group_static<float[SIZE]> LocalMem;
LocalMem[Lid] = Src[Lid] * Src[Lid];
sycl::group_barrier(Item.get_group());
if (Item.get_group().leader()) { // Check that memory is indeed shared between
// the work group.
for (int I = 0; I < SIZE; ++I)
assert(LocalMem[I] == Src[I] * Src[I]);
}
Dst[Lid] = LocalMem[Lid];
}

SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::nd_range_kernel<1>))
void scratchStaticKernel(float *Src, float *Dst) {
size_t Lid = syclext::this_work_item::get_nd_item<1>().get_local_linear_id();
float *ScratchMem =
reinterpret_cast<float *>(syclexp::get_work_group_scratch_memory());
syclexp::work_group_static<float[SIZE]> StaticMem;
ScratchMem[Lid] = Src[Lid];
StaticMem[Lid] = Src[Lid];
Dst[Lid] = ScratchMem[Lid] + StaticMem[Lid];
}

int main() {
sycl::queue Q;
float *Src = sycl::malloc_shared<float>(SIZE, Q);
float *Dst = sycl::malloc_shared<float>(SIZE, Q);

for (int I = 0; I < SIZE; I++) {
Src[I] = I;
}

auto ScratchBndl =
syclexp::get_kernel_bundle<scratchKernel, sycl::bundle_state::executable>(
Q.get_context());
auto StaticBndl =
syclexp::get_kernel_bundle<staticKernel, sycl::bundle_state::executable>(
Q.get_context());
auto ScratchStaticBndl = syclexp::get_kernel_bundle<
scratchStaticKernel, sycl::bundle_state::executable>(Q.get_context());

sycl::kernel ScratchKrn =
ScratchBndl.template ext_oneapi_get_kernel<scratchKernel>();
sycl::kernel StaticKrn =
StaticBndl.template ext_oneapi_get_kernel<staticKernel>();
sycl::kernel ScratchStaticKrn =
ScratchStaticBndl.template ext_oneapi_get_kernel<scratchStaticKernel>();
syclexp::launch_config ScratchKernelcfg{
::sycl::nd_range<1>(::sycl::range<1>(SIZE), ::sycl::range<1>(SIZE)),
syclexp::properties{
syclexp::work_group_scratch_size(SIZE * sizeof(float))}};
syclexp::launch_config StaticKernelcfg{
::sycl::nd_range<1>(::sycl::range<1>(SIZE), ::sycl::range<1>(SIZE))};

syclexp::nd_launch(Q, ScratchKernelcfg, ScratchKrn, Src, Dst);
Q.wait();
for (int I = 0; I < SIZE; I++) {
assert(Dst[I] == 2 * Src[I]);
}

syclexp::nd_launch(Q, StaticKernelcfg, StaticKrn, Src, Dst);
Q.wait();
for (int I = 0; I < SIZE; I++) {
assert(Dst[I] == Src[I] * Src[I]);
}

syclexp::nd_launch(Q, ScratchKernelcfg, ScratchStaticKrn, Src, Dst);
Q.wait();
for (int I = 0; I < SIZE; I++) {
assert(Dst[I] == 2 * Src[I]);
}

sycl::free(Src, Q);
sycl::free(Dst, Q);
return 0;
}
Loading