Skip to content

[SYCL][NCFI] Refactor method of storing extra information obtained from kernel launch properties #19474

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: sycl
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 9 additions & 8 deletions sycl/source/detail/cg.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,11 @@ class NDRDescT {
std::array<size_t, 3> ClusterDimensions{1, 1, 1};
size_t Dims = 0;
};
struct CustomLaunchArguments {
size_t KernelWorkGroupMemorySize = 0;
bool KernelIsCooperative = false;
bool KernelUsesClusterLaunch = false;
};

/// Base class for all types of command groups.
class CG {
Expand Down Expand Up @@ -261,9 +266,7 @@ class CGExecKernel : public CG {
/// of command-groups that a kernel command can be updated to.
std::vector<std::weak_ptr<CGExecKernel>> MAlternativeKernels;
ur_kernel_cache_config_t MKernelCacheConfig;
bool MKernelIsCooperative = false;
bool MKernelUsesClusterLaunch = false;
size_t MKernelWorkGroupMemorySize = 0;
CustomLaunchArguments MCustomLaunchArgs;

CGExecKernel(NDRDescT NDRDesc, std::shared_ptr<HostKernelBase> HKernel,
std::shared_ptr<detail::kernel_impl> SyclKernel,
Expand All @@ -274,8 +277,8 @@ class CGExecKernel : public CG {
std::vector<std::shared_ptr<detail::stream_impl>> Streams,
std::vector<std::shared_ptr<const void>> AuxiliaryResources,
CGType Type, ur_kernel_cache_config_t KernelCacheConfig,
bool KernelIsCooperative, bool MKernelUsesClusterLaunch,
size_t KernelWorkGroupMemorySize, detail::code_location loc = {})
CustomLaunchArguments CustomLaunchArgs,
detail::code_location loc = {})
: CG(Type, std::move(CGData), std::move(loc)),
MNDRDesc(std::move(NDRDesc)), MHostKernel(std::move(HKernel)),
MSyclKernel(std::move(SyclKernel)),
Expand All @@ -285,9 +288,7 @@ class CGExecKernel : public CG {
MStreams(std::move(Streams)),
MAuxiliaryResources(std::move(AuxiliaryResources)),
MAlternativeKernels{}, MKernelCacheConfig(std::move(KernelCacheConfig)),
MKernelIsCooperative(KernelIsCooperative),
MKernelUsesClusterLaunch(MKernelUsesClusterLaunch),
MKernelWorkGroupMemorySize(KernelWorkGroupMemorySize) {
MCustomLaunchArgs(CustomLaunchArgs) {
assert(getType() == CGType::Kernel && "Wrong type of exec kernel CG.");
}

Expand Down
6 changes: 3 additions & 3 deletions sycl/source/detail/handler_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,9 +106,9 @@ class handler_impl {

ur_kernel_cache_config_t MKernelCacheConfig = UR_KERNEL_CACHE_CONFIG_DEFAULT;

bool MKernelIsCooperative = false;
bool MKernelUsesClusterLaunch = false;
uint32_t MKernelWorkGroupMemorySize = 0;
// Extra information for custom kernel launch passed through property
// processing.
CustomLaunchArguments MCustomLaunchArgs = {};

// Extra information for bindless image copy
ur_image_desc_t MSrcImageDesc = {};
Expand Down
47 changes: 25 additions & 22 deletions sycl/source/detail/scheduler/commands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -321,8 +321,10 @@ bool Command::isFusable() const {
}
const auto &CG = (static_cast<const ExecCGCommand &>(*this)).getCG();
return (CG.getType() == CGType::Kernel) &&
(!static_cast<const CGExecKernel &>(CG).MKernelIsCooperative) &&
(!static_cast<const CGExecKernel &>(CG).MKernelUsesClusterLaunch);
(!static_cast<const CGExecKernel &>(CG)
.MCustomLaunchArgs.KernelIsCooperative) &&
(!static_cast<const CGExecKernel &>(CG)
.MCustomLaunchArgs.KernelUsesClusterLaunch);
}
#endif // __INTEL_PREVIEW_BREAKING_CHANGES

Expand Down Expand Up @@ -2405,8 +2407,7 @@ static ur_result_t SetKernelParamsAndLaunch(
std::vector<ur_event_handle_t> &RawEvents, detail::event_impl *OutEventImpl,
const KernelArgMask *EliminatedArgMask,
const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
bool IsCooperative, bool KernelUsesClusterLaunch,
uint32_t WorkGroupMemorySize, const RTDeviceBinaryImage *BinImage,
CustomLaunchArguments CustomLuanchArgs, const RTDeviceBinaryImage *BinImage,
KernelNameStrRefT KernelName,
KernelNameBasedCacheT *KernelNameBasedCachePtr,
void *KernelFuncPtr = nullptr, int KernelNumArgs = 0,
Expand Down Expand Up @@ -2465,7 +2466,8 @@ static ur_result_t SetKernelParamsAndLaunch(
// this indicates the buffer is actually unused and was elided.
if (ImplicitLocalArg.has_value() && ImplicitLocalArg.value() != -1) {
Adapter.call<UrApiKind::urKernelSetArgLocal>(
Kernel, ImplicitLocalArg.value(), WorkGroupMemorySize, nullptr);
Kernel, ImplicitLocalArg.value(),
CustomLuanchArgs.KernelWorkGroupMemorySize, nullptr);
}

adjustNDRangePerKernel(NDRDesc, Kernel, Queue.getDeviceImpl());
Expand Down Expand Up @@ -2499,7 +2501,7 @@ static ur_result_t SetKernelParamsAndLaunch(

std::vector<ur_kernel_launch_property_t> property_list;

if (KernelUsesClusterLaunch) {
if (CustomLuanchArgs.KernelUsesClusterLaunch) {
ur_kernel_launch_property_value_t launch_property_value_cluster_range;
launch_property_value_cluster_range.clusterDim[0] =
NDRDesc.ClusterDimensions[0];
Expand All @@ -2511,16 +2513,19 @@ static ur_result_t SetKernelParamsAndLaunch(
property_list.push_back({UR_KERNEL_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION,
launch_property_value_cluster_range});
}
if (IsCooperative) {
if (CustomLuanchArgs.KernelIsCooperative) {
ur_kernel_launch_property_value_t launch_property_value_cooperative;
launch_property_value_cooperative.cooperative = 1;
property_list.push_back({UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE,
launch_property_value_cooperative});
}
// If there is no implicit arg, let the driver handle it via a property
if (WorkGroupMemorySize && !ImplicitLocalArg.has_value()) {
property_list.push_back({UR_KERNEL_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY,
{{WorkGroupMemorySize}}});
if (CustomLuanchArgs.KernelWorkGroupMemorySize &&
!ImplicitLocalArg.has_value()) {
property_list.push_back(
{UR_KERNEL_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY,
{{static_cast<uint32_t>(
CustomLuanchArgs.KernelWorkGroupMemorySize)}}});
}
ur_event_handle_t UREvent = nullptr;
ur_result_t Error = Adapter.call_nocheck<UrApiKind::urEnqueueKernelLaunch>(
Expand Down Expand Up @@ -2681,9 +2686,9 @@ void enqueueImpKernel(
KernelNameBasedCacheT *KernelNameBasedCachePtr,
std::vector<ur_event_handle_t> &RawEvents, detail::event_impl *OutEventImpl,
const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
ur_kernel_cache_config_t KernelCacheConfig, const bool KernelIsCooperative,
const bool KernelUsesClusterLaunch, const size_t WorkGroupMemorySize,
const RTDeviceBinaryImage *BinImage, void *KernelFuncPtr, int KernelNumArgs,
ur_kernel_cache_config_t KernelCacheConfig,
CustomLaunchArguments CustomLaunchArgs, const RTDeviceBinaryImage *BinImage,
void *KernelFuncPtr, int KernelNumArgs,
detail::kernel_param_desc_t (*KernelParamDescGetter)(int),
bool KernelHasSpecialCaptures) {
// Run OpenCL kernel
Expand Down Expand Up @@ -2770,8 +2775,7 @@ void enqueueImpKernel(

Error = SetKernelParamsAndLaunch(
Queue, Args, DeviceImageImpl, Kernel, NDRDesc, EventsWaitList,
OutEventImpl, EliminatedArgMask, getMemAllocationFunc,
KernelIsCooperative, KernelUsesClusterLaunch, WorkGroupMemorySize,
OutEventImpl, EliminatedArgMask, getMemAllocationFunc, CustomLaunchArgs,
BinImage, KernelName, KernelNameBasedCachePtr, KernelFuncPtr,
KernelNumArgs, KernelParamDescGetter, KernelHasSpecialCaptures);
}
Expand Down Expand Up @@ -3264,13 +3268,12 @@ ur_result_t ExecCGCommand::enqueueImpQueue() {
BinImage = retrieveKernelBinary(*MQueue, KernelName);
assert(BinImage && "Failed to obtain a binary image.");
}
enqueueImpKernel(
*MQueue, NDRDesc, Args, ExecKernel->getKernelBundle().get(),
SyclKernel.get(), KernelName, ExecKernel->MKernelNameBasedCachePtr,
RawEvents, EventImpl, getMemAllocationFunc,
ExecKernel->MKernelCacheConfig, ExecKernel->MKernelIsCooperative,
ExecKernel->MKernelUsesClusterLaunch,
ExecKernel->MKernelWorkGroupMemorySize, BinImage);
enqueueImpKernel(*MQueue, NDRDesc, Args,
ExecKernel->getKernelBundle().get(), SyclKernel.get(),
KernelName, ExecKernel->MKernelNameBasedCachePtr,
RawEvents, EventImpl, getMemAllocationFunc,
ExecKernel->MKernelCacheConfig,
ExecKernel->MCustomLaunchArgs, BinImage);

return UR_RESULT_SUCCESS;
}
Expand Down
4 changes: 2 additions & 2 deletions sycl/source/detail/scheduler/commands.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -633,8 +633,8 @@ void enqueueImpKernel(
KernelNameBasedCacheT *KernelNameBasedCachePtr,
std::vector<ur_event_handle_t> &RawEvents, detail::event_impl *OutEventImpl,
const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
ur_kernel_cache_config_t KernelCacheConfig, bool KernelIsCooperative,
const bool KernelUsesClusterLaunch, const size_t WorkGroupMemorySize,
ur_kernel_cache_config_t KernelCacheConfig,
CustomLaunchArguments CustomLaunchArgs,
const RTDeviceBinaryImage *BinImage = nullptr,
void *KernelFuncPtr = nullptr, int KernelNumArgs = 0,
detail::kernel_param_desc_t (*KernelParamDescGetter)(int) = nullptr,
Expand Down
19 changes: 8 additions & 11 deletions sycl/source/handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -605,8 +605,7 @@ event handler::finalize() {
impl->get_queue(), impl->MNDRDesc, impl->MArgs, KernelBundleImpPtr,
MKernel.get(), toKernelNameStrT(MKernelName),
impl->MKernelNameBasedCachePtr, RawEvents, ResultEvent.get(),
nullptr, impl->MKernelCacheConfig, impl->MKernelIsCooperative,
impl->MKernelUsesClusterLaunch, impl->MKernelWorkGroupMemorySize,
nullptr, impl->MKernelCacheConfig, impl->MCustomLaunchArgs,
BinImage, impl->MKernelFuncPtr, impl->MKernelNumArgs,
impl->MKernelParamDescGetter, impl->MKernelHasSpecialCaptures);
#ifdef XPTI_ENABLE_INSTRUMENTATION
Expand Down Expand Up @@ -667,9 +666,7 @@ event handler::finalize() {
std::move(impl->MArgs), toKernelNameStrT(MKernelName),
impl->MKernelNameBasedCachePtr, std::move(MStreamStorage),
std::move(impl->MAuxiliaryResources), getType(),
impl->MKernelCacheConfig, impl->MKernelIsCooperative,
impl->MKernelUsesClusterLaunch, impl->MKernelWorkGroupMemorySize,
MCodeLoc));
impl->MKernelCacheConfig, impl->MCustomLaunchArgs, MCodeLoc));
break;
}
case detail::CGType::CopyAccToPtr:
Expand Down Expand Up @@ -2240,15 +2237,15 @@ void handler::setKernelCacheConfig(handler::StableKernelCacheConfig Config) {
}

void handler::setKernelIsCooperative(bool KernelIsCooperative) {
impl->MKernelIsCooperative = KernelIsCooperative;
impl->MCustomLaunchArgs.KernelIsCooperative = KernelIsCooperative;
}

#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
void handler::setKernelClusterLaunch(sycl::range<3> ClusterSize, int Dims) {
throwIfGraphAssociated<
syclex::detail::UnsupportedGraphFeatures::
sycl_ext_oneapi_experimental_cuda_cluster_launch>();
impl->MKernelUsesClusterLaunch = true;
impl->MCustomLaunchArgs.KernelUsesClusterLaunch = true;

if (Dims == 1) {
sycl::range<1> ClusterSizeTrimmed = {ClusterSize[0]};
Expand All @@ -2266,30 +2263,30 @@ void handler::setKernelClusterLaunch(sycl::range<3> ClusterSize) {
throwIfGraphAssociated<
syclex::detail::UnsupportedGraphFeatures::
sycl_ext_oneapi_experimental_cuda_cluster_launch>();
impl->MKernelUsesClusterLaunch = true;
impl->MCustomLaunchArgs.KernelUsesClusterLaunch = true;
impl->MNDRDesc.setClusterDimensions(ClusterSize);
}

void handler::setKernelClusterLaunch(sycl::range<2> ClusterSize) {
throwIfGraphAssociated<
syclex::detail::UnsupportedGraphFeatures::
sycl_ext_oneapi_experimental_cuda_cluster_launch>();
impl->MKernelUsesClusterLaunch = true;
impl->MCustomLaunchArgs.KernelUsesClusterLaunch = true;
impl->MNDRDesc.setClusterDimensions(ClusterSize);
}

void handler::setKernelClusterLaunch(sycl::range<1> ClusterSize) {
throwIfGraphAssociated<
syclex::detail::UnsupportedGraphFeatures::
sycl_ext_oneapi_experimental_cuda_cluster_launch>();
impl->MKernelUsesClusterLaunch = true;
impl->MCustomLaunchArgs.KernelUsesClusterLaunch = true;
impl->MNDRDesc.setClusterDimensions(ClusterSize);
}

void handler::setKernelWorkGroupMem(size_t Size) {
throwIfGraphAssociated<syclex::detail::UnsupportedGraphFeatures::
sycl_ext_oneapi_work_group_scratch_memory>();
impl->MKernelWorkGroupMemorySize = Size;
impl->MCustomLaunchArgs.KernelWorkGroupMemorySize = Size;
}

void handler::ext_oneapi_graph(
Expand Down
Loading