Skip to content
Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ void single_task(queue Q, const KernelType &KernelObj,
if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT<KernelType,
void>::value)) {
detail::submit_kernel_direct_single_task<KernelName>(
std::move(Q), KernelObj, empty_properties_t{}, CodeLoc);
std::move(Q), KernelObj, {}, empty_properties_t{}, CodeLoc);
} else {
submit(
std::move(Q),
Expand Down Expand Up @@ -274,7 +274,7 @@ void nd_launch(queue Q, nd_range<Dimensions> Range, const KernelType &KernelObj,
!(detail::KernelLambdaHasKernelHandlerArgT<
KernelType, sycl::nd_item<Dimensions>>::value)) {
detail::submit_kernel_direct_parallel_for<KernelName>(std::move(Q), Range,
KernelObj);
KernelObj, {});
} else {
submit(std::move(Q), [&](handler &CGH) {
nd_launch<KernelName>(CGH, Range, KernelObj,
Expand Down Expand Up @@ -312,7 +312,7 @@ void nd_launch(queue Q, launch_config<nd_range<Dimensions>, Properties> Config,
LaunchConfigAccess(Config);

detail::submit_kernel_direct_parallel_for<KernelName>(
std::move(Q), LaunchConfigAccess.getRange(), KernelObj,
std::move(Q), LaunchConfigAccess.getRange(), KernelObj, {},
LaunchConfigAccess.getProperties());
} else {
submit(std::move(Q), [&](handler &CGH) {
Expand Down
8 changes: 4 additions & 4 deletions sycl/include/sycl/khr/free_function_commands.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ void launch_grouped(const queue &q, range<1> r, range<1> size, KernelType &&k,
if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT<
KernelType, sycl::nd_item<1>>::value)) {
detail::submit_kernel_direct_parallel_for(q, nd_range<1>(r, size),
std::forward<KernelType>(k));
std::forward<KernelType>(k), {});
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if we should set a default value for DepEvents in submit_kernel_direct_parallel_for and submit_kernel_direct_single_task, so that we can avoid passing {} everywhere.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That doesn't matter. Once handler-less path "implementation" finishes, next step would be to clean up the codebase and that would include dropping it completely and re-writing handler path to delegate to the public handler-less APIs to avoid extra level of templates.

} else {
submit(
q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
Expand All @@ -179,7 +179,7 @@ void launch_grouped(const queue &q, range<2> r, range<2> size, KernelType &&k,
if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT<
KernelType, sycl::nd_item<2>>::value)) {
detail::submit_kernel_direct_parallel_for(q, nd_range<2>(r, size),
std::forward<KernelType>(k));
std::forward<KernelType>(k), {});
} else {
submit(
q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
Expand All @@ -196,7 +196,7 @@ void launch_grouped(const queue &q, range<3> r, range<3> size, KernelType &&k,
if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT<
KernelType, sycl::nd_item<3>>::value)) {
detail::submit_kernel_direct_parallel_for(q, nd_range<3>(r, size),
std::forward<KernelType>(k));
std::forward<KernelType>(k), {});
} else {
submit(
q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
Expand Down Expand Up @@ -317,7 +317,7 @@ void launch_task(const sycl::queue &q, KernelType &&k,
if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT<KernelType,
void>::value)) {
detail::submit_kernel_direct_single_task(
q, std::forward<KernelType>(k),
q, std::forward<KernelType>(k), {},
ext::oneapi::experimental::empty_properties_t{}, codeLoc);
} else {
submit(q, [&](handler &h) { launch_task<KernelType>(h, k); }, codeLoc);
Expand Down
128 changes: 90 additions & 38 deletions sycl/include/sycl/queue.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
#include <sycl/nd_range.hpp> // for nd_range
#include <sycl/property_list.hpp> // for property_list
#include <sycl/range.hpp> // for range
#include <sycl/sycl_span.hpp> // for sycl::span

#include <cstddef> // for size_t
#include <functional> // for function
Expand Down Expand Up @@ -68,6 +69,7 @@ event __SYCL_EXPORT submit_kernel_direct_with_event_impl(
const queue &Queue, const nd_range<Dims> &Range,
detail::HostKernelRefBase &HostKernel,
detail::DeviceKernelInfo *DeviceKernelInfo,
sycl::span<const event> DepEvents,
const detail::KernelPropertyHolderStructTy &Props,
const detail::code_location &CodeLoc, bool IsTopCodeLoc);

Expand All @@ -76,6 +78,7 @@ void __SYCL_EXPORT submit_kernel_direct_without_event_impl(
const queue &Queue, const nd_range<Dims> &Range,
detail::HostKernelRefBase &HostKernel,
detail::DeviceKernelInfo *DeviceKernelInfo,
sycl::span<const event> DepEvents,
const detail::KernelPropertyHolderStructTy &Props,
const detail::code_location &CodeLoc, bool IsTopCodeLoc);

Expand Down Expand Up @@ -165,7 +168,7 @@ template <detail::WrapAs WrapAs, typename LambdaArgType,
typename KernelTypeUniversalRef, int Dims>
auto submit_kernel_direct(
const queue &Queue, const nd_range<Dims> &Range,
KernelTypeUniversalRef &&KernelFunc,
KernelTypeUniversalRef &&KernelFunc, sycl::span<const event> DepEvents,
const PropertiesT &ExtraProps =
ext::oneapi::experimental::empty_properties_t{},
const detail::code_location &CodeLoc = detail::code_location::current()) {
Expand Down Expand Up @@ -230,12 +233,14 @@ auto submit_kernel_direct(

if constexpr (EventNeeded) {
return submit_kernel_direct_with_event_impl(
Queue, Range, HostKernel, DeviceKernelInfoPtr, ParsedProperties,
TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel());
Queue, Range, HostKernel, DeviceKernelInfoPtr, DepEvents,
ParsedProperties, TlsCodeLocCapture.query(),
TlsCodeLocCapture.isToplevel());
} else {
submit_kernel_direct_without_event_impl(
Queue, Range, HostKernel, DeviceKernelInfoPtr, ParsedProperties,
TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel());
Queue, Range, HostKernel, DeviceKernelInfoPtr, DepEvents,
ParsedProperties, TlsCodeLocCapture.query(),
TlsCodeLocCapture.isToplevel());
}
}

Expand All @@ -244,7 +249,7 @@ template <typename KernelName = detail::auto_name, bool EventNeeded = false,
typename KernelTypeUniversalRef, int Dims>
auto submit_kernel_direct_parallel_for(
const queue &Queue, const nd_range<Dims> &Range,
KernelTypeUniversalRef &&KernelFunc,
KernelTypeUniversalRef &&KernelFunc, sycl::span<const event> DepEvents,
const PropertiesT &Props = ext::oneapi::experimental::empty_properties_t{},
const detail::code_location &CodeLoc = detail::code_location::current()) {

Expand All @@ -266,23 +271,25 @@ auto submit_kernel_direct_parallel_for(
return submit_kernel_direct<detail::WrapAs::parallel_for, TransformedArgType,
KernelName, EventNeeded, PropertiesT,
KernelTypeUniversalRef, Dims>(
Queue, Range, std::forward<KernelTypeUniversalRef>(KernelFunc), Props,
CodeLoc);
Queue, Range, std::forward<KernelTypeUniversalRef>(KernelFunc), DepEvents,
Props, CodeLoc);
}

template <typename KernelName = detail::auto_name, bool EventNeeded = false,
typename PropertiesT = ext::oneapi::experimental::empty_properties_t,
typename KernelTypeUniversalRef>
auto submit_kernel_direct_single_task(
const queue &Queue, KernelTypeUniversalRef &&KernelFunc,
sycl::span<const event> DepEvents,
const PropertiesT &Props = ext::oneapi::experimental::empty_properties_t{},
const detail::code_location &CodeLoc = detail::code_location::current()) {

return submit_kernel_direct<detail::WrapAs::single_task, void, KernelName,
EventNeeded, PropertiesT, KernelTypeUniversalRef,
1>(
Queue, nd_range<1>{1, 1},
std::forward<KernelTypeUniversalRef>(KernelFunc), Props, CodeLoc);
std::forward<KernelTypeUniversalRef>(KernelFunc), DepEvents, Props,
CodeLoc);
}

} // namespace detail
Expand Down Expand Up @@ -2802,7 +2809,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT<KernelType,
void>::value)) {
return detail::submit_kernel_direct_single_task<KernelName, true>(
*this, KernelFunc, Properties, TlsCodeLocCapture.query());
*this, KernelFunc, {}, Properties, TlsCodeLocCapture.query());
} else {
return submit(
[&](handler &CGH) {
Expand Down Expand Up @@ -2852,13 +2859,23 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
"Use queue.submit() instead");

detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
return submit(
[&](handler &CGH) {
CGH.depends_on(DepEvent);
CGH.template single_task<KernelName, KernelType, PropertiesT>(
Properties, KernelFunc);
},
TlsCodeLocCapture.query());

// TODO The handler-less path does not support kernel functions
// with the kernel_handler type argument yet.
if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT<KernelType,
void>::value)) {
return detail::submit_kernel_direct_single_task<KernelName, true>(
*this, KernelFunc, sycl::span<const event>(&DepEvent, 1), Properties,
TlsCodeLocCapture.query());
} else {
return submit(
[&](handler &CGH) {
CGH.depends_on(DepEvent);
CGH.template single_task<KernelName, KernelType, PropertiesT>(
Properties, KernelFunc);
},
TlsCodeLocCapture.query());
}
}

/// single_task version with a kernel represented as a lambda.
Expand Down Expand Up @@ -2903,13 +2920,22 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
"Use queue.submit() instead");

detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
return submit(
[&](handler &CGH) {
CGH.depends_on(DepEvents);
CGH.template single_task<KernelName, KernelType, PropertiesT>(
Properties, KernelFunc);
},
TlsCodeLocCapture.query());

// TODO The handler-less path does not support kernel functions
// with the kernel_handler type argument yet.
if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT<KernelType,
void>::value)) {
return detail::submit_kernel_direct_single_task<KernelName, true>(
*this, KernelFunc, DepEvents, Properties, TlsCodeLocCapture.query());
} else {
return submit(
[&](handler &CGH) {
CGH.depends_on(DepEvents);
CGH.template single_task<KernelName, KernelType, PropertiesT>(
Properties, KernelFunc);
},
TlsCodeLocCapture.query());
}
}

/// single_task version with a kernel represented as a lambda.
Expand Down Expand Up @@ -3348,7 +3374,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
KernelType, sycl::nd_item<Dims>>::value)) {

return detail::submit_kernel_direct_parallel_for<KernelName, true>(
*this, Range, Rest..., Properties, TlsCodeLocCapture.query());
*this, Range, Rest..., {}, Properties, TlsCodeLocCapture.query());
} else
return submit(
[&](handler &CGH) {
Expand Down Expand Up @@ -3377,7 +3403,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
!(detail::KernelLambdaHasKernelHandlerArgT<
KernelType, sycl::nd_item<Dims>>::value)) {
return detail::submit_kernel_direct_parallel_for<KernelName, true>(
*this, Range, Rest...,
*this, Range, Rest..., {},
ext::oneapi::experimental::empty_properties_t{},
TlsCodeLocCapture.query());
} else {
Expand Down Expand Up @@ -3431,12 +3457,25 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
parallel_for(nd_range<Dims> Range, event DepEvent, RestT &&...Rest) {
constexpr detail::code_location CodeLoc = getCodeLocation<KernelName>();
detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
return submit(
[&](handler &CGH) {
CGH.depends_on(DepEvent);
CGH.template parallel_for<KernelName>(Range, Rest...);
},
TlsCodeLocCapture.query());
using KernelType = std::tuple_element_t<0, std::tuple<RestT...>>;

// TODO The handler-less path does not support reductions, and
// kernel functions with the kernel_handler type argument yet.
if constexpr (sizeof...(RestT) == 1 &&
!(detail::KernelLambdaHasKernelHandlerArgT<
KernelType, sycl::nd_item<Dims>>::value)) {
return detail::submit_kernel_direct_parallel_for<KernelName, true>(
*this, Range, Rest..., sycl::span<const event>(&DepEvent, 1),
ext::oneapi::experimental::empty_properties_t{},
TlsCodeLocCapture.query());
} else {
return submit(
[&](handler &CGH) {
CGH.depends_on(DepEvent);
CGH.template parallel_for<KernelName>(Range, Rest...);
},
TlsCodeLocCapture.query());
}
}

/// parallel_for version with a kernel represented as a lambda + nd_range that
Expand Down Expand Up @@ -3485,12 +3524,25 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
RestT &&...Rest) {
constexpr detail::code_location CodeLoc = getCodeLocation<KernelName>();
detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
return submit(
[&](handler &CGH) {
CGH.depends_on(DepEvents);
CGH.template parallel_for<KernelName>(Range, Rest...);
},
TlsCodeLocCapture.query());
using KernelType = std::tuple_element_t<0, std::tuple<RestT...>>;

// TODO The handler-less path does not support reductions, and
// kernel functions with the kernel_handler type argument yet.
if constexpr (sizeof...(RestT) == 1 &&
!(detail::KernelLambdaHasKernelHandlerArgT<
KernelType, sycl::nd_item<Dims>>::value)) {
return detail::submit_kernel_direct_parallel_for<KernelName, true>(
*this, Range, Rest..., DepEvents,
ext::oneapi::experimental::empty_properties_t{},
TlsCodeLocCapture.query());
} else {
return submit(
[&](handler &CGH) {
CGH.depends_on(DepEvents);
CGH.template parallel_for<KernelName>(Range, Rest...);
},
TlsCodeLocCapture.query());
}
}

/// Copies data from a memory region pointed to by a placeholder accessor to
Expand Down
Loading