diff --git a/sycl/source/detail/cg.hpp b/sycl/source/detail/cg.hpp index f48f6ace13ddd..f3a22d5131bb0 100644 --- a/sycl/source/detail/cg.hpp +++ b/sycl/source/detail/cg.hpp @@ -154,6 +154,11 @@ class NDRDescT { std::array ClusterDimensions{1, 1, 1}; size_t Dims = 0; }; +struct CustomLaunchArguments { + size_t KernelWorkGroupMemorySize = 0; + bool KernelIsCooperative = false; + bool KernelUsesClusterLaunch = false; +}; /// Base class for all types of command groups. class CG { @@ -261,9 +266,7 @@ class CGExecKernel : public CG { /// of command-groups that a kernel command can be updated to. std::vector> MAlternativeKernels; ur_kernel_cache_config_t MKernelCacheConfig; - bool MKernelIsCooperative = false; - bool MKernelUsesClusterLaunch = false; - size_t MKernelWorkGroupMemorySize = 0; + CustomLaunchArguments MCustomLaunchArgs; CGExecKernel(NDRDescT NDRDesc, std::shared_ptr HKernel, std::shared_ptr SyclKernel, @@ -274,8 +277,8 @@ class CGExecKernel : public CG { std::vector> Streams, std::vector> AuxiliaryResources, CGType Type, ur_kernel_cache_config_t KernelCacheConfig, - bool KernelIsCooperative, bool MKernelUsesClusterLaunch, - size_t KernelWorkGroupMemorySize, detail::code_location loc = {}) + CustomLaunchArguments CustomLaunchArgs, + detail::code_location loc = {}) : CG(Type, std::move(CGData), std::move(loc)), MNDRDesc(std::move(NDRDesc)), MHostKernel(std::move(HKernel)), MSyclKernel(std::move(SyclKernel)), @@ -285,9 +288,7 @@ class CGExecKernel : public CG { MStreams(std::move(Streams)), MAuxiliaryResources(std::move(AuxiliaryResources)), MAlternativeKernels{}, MKernelCacheConfig(std::move(KernelCacheConfig)), - MKernelIsCooperative(KernelIsCooperative), - MKernelUsesClusterLaunch(MKernelUsesClusterLaunch), - MKernelWorkGroupMemorySize(KernelWorkGroupMemorySize) { + MCustomLaunchArgs(CustomLaunchArgs) { assert(getType() == CGType::Kernel && "Wrong type of exec kernel CG."); } diff --git a/sycl/source/detail/handler_impl.hpp b/sycl/source/detail/handler_impl.hpp index 0fda3dd4f2769..94818779f0885 100644 --- a/sycl/source/detail/handler_impl.hpp +++ b/sycl/source/detail/handler_impl.hpp @@ -106,9 +106,9 @@ class handler_impl { ur_kernel_cache_config_t MKernelCacheConfig = UR_KERNEL_CACHE_CONFIG_DEFAULT; - bool MKernelIsCooperative = false; - bool MKernelUsesClusterLaunch = false; - uint32_t MKernelWorkGroupMemorySize = 0; + // Extra information for custom kernel launch passed through property + // processing. + CustomLaunchArguments MCustomLaunchArgs = {}; // Extra information for bindless image copy ur_image_desc_t MSrcImageDesc = {}; diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 6709bf0d9ac19..9294cc11179d1 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -321,8 +321,10 @@ bool Command::isFusable() const { } const auto &CG = (static_cast(*this)).getCG(); return (CG.getType() == CGType::Kernel) && - (!static_cast(CG).MKernelIsCooperative) && - (!static_cast(CG).MKernelUsesClusterLaunch); + (!static_cast(CG) + .MCustomLaunchArgs.KernelIsCooperative) && + (!static_cast(CG) + .MCustomLaunchArgs.KernelUsesClusterLaunch); } #endif // __INTEL_PREVIEW_BREAKING_CHANGES @@ -2405,8 +2407,7 @@ static ur_result_t SetKernelParamsAndLaunch( std::vector &RawEvents, detail::event_impl *OutEventImpl, const KernelArgMask *EliminatedArgMask, const std::function &getMemAllocationFunc, - bool IsCooperative, bool KernelUsesClusterLaunch, - uint32_t WorkGroupMemorySize, const RTDeviceBinaryImage *BinImage, + CustomLaunchArguments CustomLuanchArgs, const RTDeviceBinaryImage *BinImage, KernelNameStrRefT KernelName, KernelNameBasedCacheT *KernelNameBasedCachePtr, void *KernelFuncPtr = nullptr, int KernelNumArgs = 0, @@ -2465,7 +2466,8 @@ static ur_result_t SetKernelParamsAndLaunch( // this indicates the buffer is actually unused and was elided. if (ImplicitLocalArg.has_value() && ImplicitLocalArg.value() != -1) { Adapter.call( - Kernel, ImplicitLocalArg.value(), WorkGroupMemorySize, nullptr); + Kernel, ImplicitLocalArg.value(), + CustomLuanchArgs.KernelWorkGroupMemorySize, nullptr); } adjustNDRangePerKernel(NDRDesc, Kernel, Queue.getDeviceImpl()); @@ -2499,7 +2501,7 @@ static ur_result_t SetKernelParamsAndLaunch( std::vector property_list; - if (KernelUsesClusterLaunch) { + if (CustomLuanchArgs.KernelUsesClusterLaunch) { ur_kernel_launch_property_value_t launch_property_value_cluster_range; launch_property_value_cluster_range.clusterDim[0] = NDRDesc.ClusterDimensions[0]; @@ -2511,16 +2513,19 @@ static ur_result_t SetKernelParamsAndLaunch( property_list.push_back({UR_KERNEL_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION, launch_property_value_cluster_range}); } - if (IsCooperative) { + if (CustomLuanchArgs.KernelIsCooperative) { ur_kernel_launch_property_value_t launch_property_value_cooperative; launch_property_value_cooperative.cooperative = 1; property_list.push_back({UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE, launch_property_value_cooperative}); } // If there is no implicit arg, let the driver handle it via a property - if (WorkGroupMemorySize && !ImplicitLocalArg.has_value()) { - property_list.push_back({UR_KERNEL_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY, - {{WorkGroupMemorySize}}}); + if (CustomLuanchArgs.KernelWorkGroupMemorySize && + !ImplicitLocalArg.has_value()) { + property_list.push_back( + {UR_KERNEL_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY, + {{static_cast( + CustomLuanchArgs.KernelWorkGroupMemorySize)}}}); } ur_event_handle_t UREvent = nullptr; ur_result_t Error = Adapter.call_nocheck( @@ -2681,9 +2686,9 @@ void enqueueImpKernel( KernelNameBasedCacheT *KernelNameBasedCachePtr, std::vector &RawEvents, detail::event_impl *OutEventImpl, const std::function &getMemAllocationFunc, - ur_kernel_cache_config_t KernelCacheConfig, const bool KernelIsCooperative, - const bool KernelUsesClusterLaunch, const size_t WorkGroupMemorySize, - const RTDeviceBinaryImage *BinImage, void *KernelFuncPtr, int KernelNumArgs, + ur_kernel_cache_config_t KernelCacheConfig, + CustomLaunchArguments CustomLaunchArgs, const RTDeviceBinaryImage *BinImage, + void *KernelFuncPtr, int KernelNumArgs, detail::kernel_param_desc_t (*KernelParamDescGetter)(int), bool KernelHasSpecialCaptures) { // Run OpenCL kernel @@ -2770,8 +2775,7 @@ void enqueueImpKernel( Error = SetKernelParamsAndLaunch( Queue, Args, DeviceImageImpl, Kernel, NDRDesc, EventsWaitList, - OutEventImpl, EliminatedArgMask, getMemAllocationFunc, - KernelIsCooperative, KernelUsesClusterLaunch, WorkGroupMemorySize, + OutEventImpl, EliminatedArgMask, getMemAllocationFunc, CustomLaunchArgs, BinImage, KernelName, KernelNameBasedCachePtr, KernelFuncPtr, KernelNumArgs, KernelParamDescGetter, KernelHasSpecialCaptures); } @@ -3264,13 +3268,12 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { BinImage = retrieveKernelBinary(*MQueue, KernelName); assert(BinImage && "Failed to obtain a binary image."); } - enqueueImpKernel( - *MQueue, NDRDesc, Args, ExecKernel->getKernelBundle().get(), - SyclKernel.get(), KernelName, ExecKernel->MKernelNameBasedCachePtr, - RawEvents, EventImpl, getMemAllocationFunc, - ExecKernel->MKernelCacheConfig, ExecKernel->MKernelIsCooperative, - ExecKernel->MKernelUsesClusterLaunch, - ExecKernel->MKernelWorkGroupMemorySize, BinImage); + enqueueImpKernel(*MQueue, NDRDesc, Args, + ExecKernel->getKernelBundle().get(), SyclKernel.get(), + KernelName, ExecKernel->MKernelNameBasedCachePtr, + RawEvents, EventImpl, getMemAllocationFunc, + ExecKernel->MKernelCacheConfig, + ExecKernel->MCustomLaunchArgs, BinImage); return UR_RESULT_SUCCESS; } diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 892b33430b46c..9c1ceed536405 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -633,8 +633,8 @@ void enqueueImpKernel( KernelNameBasedCacheT *KernelNameBasedCachePtr, std::vector &RawEvents, detail::event_impl *OutEventImpl, const std::function &getMemAllocationFunc, - ur_kernel_cache_config_t KernelCacheConfig, bool KernelIsCooperative, - const bool KernelUsesClusterLaunch, const size_t WorkGroupMemorySize, + ur_kernel_cache_config_t KernelCacheConfig, + CustomLaunchArguments CustomLaunchArgs, const RTDeviceBinaryImage *BinImage = nullptr, void *KernelFuncPtr = nullptr, int KernelNumArgs = 0, detail::kernel_param_desc_t (*KernelParamDescGetter)(int) = nullptr, diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index dc5d2f9df6758..68ec91c8d831d 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -605,8 +605,7 @@ event handler::finalize() { impl->get_queue(), impl->MNDRDesc, impl->MArgs, KernelBundleImpPtr, MKernel.get(), toKernelNameStrT(MKernelName), impl->MKernelNameBasedCachePtr, RawEvents, ResultEvent.get(), - nullptr, impl->MKernelCacheConfig, impl->MKernelIsCooperative, - impl->MKernelUsesClusterLaunch, impl->MKernelWorkGroupMemorySize, + nullptr, impl->MKernelCacheConfig, impl->MCustomLaunchArgs, BinImage, impl->MKernelFuncPtr, impl->MKernelNumArgs, impl->MKernelParamDescGetter, impl->MKernelHasSpecialCaptures); #ifdef XPTI_ENABLE_INSTRUMENTATION @@ -667,9 +666,7 @@ event handler::finalize() { std::move(impl->MArgs), toKernelNameStrT(MKernelName), impl->MKernelNameBasedCachePtr, std::move(MStreamStorage), std::move(impl->MAuxiliaryResources), getType(), - impl->MKernelCacheConfig, impl->MKernelIsCooperative, - impl->MKernelUsesClusterLaunch, impl->MKernelWorkGroupMemorySize, - MCodeLoc)); + impl->MKernelCacheConfig, impl->MCustomLaunchArgs, MCodeLoc)); break; } case detail::CGType::CopyAccToPtr: @@ -2240,7 +2237,7 @@ void handler::setKernelCacheConfig(handler::StableKernelCacheConfig Config) { } void handler::setKernelIsCooperative(bool KernelIsCooperative) { - impl->MKernelIsCooperative = KernelIsCooperative; + impl->MCustomLaunchArgs.KernelIsCooperative = KernelIsCooperative; } #ifndef __INTEL_PREVIEW_BREAKING_CHANGES @@ -2248,7 +2245,7 @@ void handler::setKernelClusterLaunch(sycl::range<3> ClusterSize, int Dims) { throwIfGraphAssociated< syclex::detail::UnsupportedGraphFeatures:: sycl_ext_oneapi_experimental_cuda_cluster_launch>(); - impl->MKernelUsesClusterLaunch = true; + impl->MCustomLaunchArgs.KernelUsesClusterLaunch = true; if (Dims == 1) { sycl::range<1> ClusterSizeTrimmed = {ClusterSize[0]}; @@ -2266,7 +2263,7 @@ void handler::setKernelClusterLaunch(sycl::range<3> ClusterSize) { throwIfGraphAssociated< syclex::detail::UnsupportedGraphFeatures:: sycl_ext_oneapi_experimental_cuda_cluster_launch>(); - impl->MKernelUsesClusterLaunch = true; + impl->MCustomLaunchArgs.KernelUsesClusterLaunch = true; impl->MNDRDesc.setClusterDimensions(ClusterSize); } @@ -2274,7 +2271,7 @@ void handler::setKernelClusterLaunch(sycl::range<2> ClusterSize) { throwIfGraphAssociated< syclex::detail::UnsupportedGraphFeatures:: sycl_ext_oneapi_experimental_cuda_cluster_launch>(); - impl->MKernelUsesClusterLaunch = true; + impl->MCustomLaunchArgs.KernelUsesClusterLaunch = true; impl->MNDRDesc.setClusterDimensions(ClusterSize); } @@ -2282,14 +2279,14 @@ void handler::setKernelClusterLaunch(sycl::range<1> ClusterSize) { throwIfGraphAssociated< syclex::detail::UnsupportedGraphFeatures:: sycl_ext_oneapi_experimental_cuda_cluster_launch>(); - impl->MKernelUsesClusterLaunch = true; + impl->MCustomLaunchArgs.KernelUsesClusterLaunch = true; impl->MNDRDesc.setClusterDimensions(ClusterSize); } void handler::setKernelWorkGroupMem(size_t Size) { throwIfGraphAssociated(); - impl->MKernelWorkGroupMemorySize = Size; + impl->MCustomLaunchArgs.KernelWorkGroupMemorySize = Size; } void handler::ext_oneapi_graph(