diff --git a/dali/pipeline/data/copy_to_external.h b/dali/pipeline/data/copy_to_external.h
index 4ec1ed40da9..c0912584141 100644
--- a/dali/pipeline/data/copy_to_external.h
+++ b/dali/pipeline/data/copy_to_external.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -48,7 +48,7 @@ inline void CopyToExternalImpl(void* dst,
                                AccessOrder order, bool use_copy_kernel) {
   DeviceGuard d(src.device_id());
   const auto &type_info = src.type_info();
-  type_info.template Copy<DstBackend, SrcBackend>(dst, src.raw_data(), src.size(), order.stream(),
+  type_info.template Copy<DstBackend, SrcBackend>(dst, src.raw_data(), src.size(), order,
                                                   use_copy_kernel);
 }
 
@@ -70,7 +70,7 @@ inline void CopyToExternalImpl(void* dst,
 
   if (src.IsContiguous()) {
     type_info.template Copy<DstBackend, SrcBackend>(dst, unsafe_raw_data(src), src._num_elements(),
-                                                    order.stream(), use_copy_kernel);
+                                                    order, use_copy_kernel);
   } else {
     const auto &src_shape = src.shape();
     BatchVector<const void *> from;
@@ -84,7 +84,7 @@ inline void CopyToExternalImpl(void* dst,
     }
 
     type_info.template Copy<DstBackend, SrcBackend>(dst, from.data(), sizes.data(),
-                                                    num_samples, order.stream(), use_copy_kernel);
+                                                    num_samples, order, use_copy_kernel);
   }
 }
 
@@ -115,7 +115,7 @@ inline void CopyToExternalImpl(void** dsts,
 
   if (src.IsContiguous() && samples_to_copy == num_samples) {
     type_info.template Copy<DstBackend, SrcBackend>(dsts, unsafe_raw_data(src), sizes.data(),
-                                                    num_samples, order.stream(), use_copy_kernel);
+                                                    num_samples, order, use_copy_kernel);
   } else {
     BatchVector<const void *> from;
@@ -130,7 +130,7 @@ inline void CopyToExternalImpl(void** dsts,
     }
 
     type_info.template Copy<DstBackend, SrcBackend>(
-        to.data(), from.data(), sizes.data(), samples_to_copy, order.stream(), use_copy_kernel);
+        to.data(), from.data(), sizes.data(), samples_to_copy, order, use_copy_kernel);
   }
 }
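The same pattern repeats across all four `CopyToExternalImpl` overloads: the `AccessOrder` is now forwarded whole instead of being collapsed to `order.stream()` at the call site. A minimal sketch of what the two kinds of order mean, assuming only DALI's `dali/core/access_order.h` (the function name is hypothetical):

```cpp
#include <cuda_runtime_api.h>

#include "dali/core/access_order.h"

// Hypothetical illustration: the two kinds of AccessOrder that now reach
// TypeInfo::Copy. A host order used to be flattened to stream 0, which is
// indistinguishable from the default CUDA stream; forwarding the order
// itself lets the callee decide how to schedule and synchronize.
void order_kinds(cudaStream_t stream) {
  dali::AccessOrder host_order = dali::AccessOrder::host();  // host-sync semantics
  dali::AccessOrder device_order(stream);                    // ordered on `stream`
  // host_order.is_device() == false; device_order.stream() == stream
  (void)host_order;
  (void)device_order;
}
```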
diff --git a/dali/pipeline/data/tensor.h b/dali/pipeline/data/tensor.h
index 5c25939a3b4..1e38a5a070a 100644
--- a/dali/pipeline/data/tensor.h
+++ b/dali/pipeline/data/tensor.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2017-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -86,7 +86,7 @@ class Tensor : public Buffer<Backend> {
 
     order.wait(order_);
     type_.template Copy<Backend, CPUBackend>(this->raw_mutable_data(),
-                                             data.data(), this->size(), order.stream());
+                                             data.data(), this->size(), order);
     order_.wait(order);
   }
 
@@ -102,7 +102,7 @@ class Tensor : public Buffer<Backend> {
 
     order.wait(order_);
     type_.template Copy<Backend, CPUBackend>(this->raw_mutable_data(),
-                                             data.data(), this->size(), order.stream());
+                                             data.data(), this->size(), order);
     order_.wait(order);
   }
 
@@ -127,7 +127,7 @@ class Tensor : public Buffer<Backend> {
     this->SetSourceInfo(other.GetSourceInfo());
     this->SetSkipSample(other.ShouldSkipSample());
     type_.template Copy<Backend, SrcBackend>(this->raw_mutable_data(),
-                                             other.raw_data(), this->size(), order.stream());
+                                             other.raw_data(), this->size(), order);
     order_.wait(order);
   }
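`Tensor<Backend>::Copy` brackets the copy with two waits: the copy order first waits for outstanding work on the buffer, and the buffer's own order then waits for the copy. A standalone sketch of that fence pattern, assuming only `dali/core/access_order.h` (the function name is hypothetical):

```cpp
#include <cuda_runtime_api.h>

#include "dali/core/access_order.h"

// Hypothetical sketch of the wait/publish pattern used around the copy:
// `copy_order` is the order the copy runs in; `buffer_order` stands in for
// the tensor's own order (order_ in tensor.h).
void fenced_copy(cudaStream_t buffer_stream, cudaStream_t copy_stream) {
  dali::AccessOrder buffer_order(buffer_stream);
  dali::AccessOrder copy_order(copy_stream);

  copy_order.wait(buffer_order);   // don't start before prior work on the buffer
  // ... the copy is issued in copy_order (now passed straight to TypeInfo::Copy) ...
  buffer_order.wait(copy_order);   // later work on the buffer sees the copy
}
```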
diff --git a/dali/pipeline/data/tensor_list.cc b/dali/pipeline/data/tensor_list.cc
index 8fd0d0f22a9..907e340404f 100644
--- a/dali/pipeline/data/tensor_list.cc
+++ b/dali/pipeline/data/tensor_list.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -92,7 +92,7 @@ void CopySamplewiseImpl(DstBatch<DstBackend> &dst, const SrcBatch<SrcBackend> &s
   }
 
   type_info.Copy<DstBackend, SrcBackend>(dsts.data(), srcs.data(), sizes.data(), num_samples,
-                                         order.stream(), use_copy_kernel);
+                                         order, use_copy_kernel);
 }
 
 
@@ -114,7 +114,7 @@ void CopySamplewiseImpl(DstBatch<DstBackend> &dst, const void *src, const TypeIn
   }
 
   type_info.Copy<DstBackend, SrcBackend>(dsts.data(), src, sizes.data(), num_samples,
-                                         order.stream(), use_copy_kernel);
+                                         order, use_copy_kernel);
 }
 
 
@@ -136,7 +136,7 @@ void CopySamplewiseImpl(void *dst, const SrcBatch<SrcBackend> &src, const TypeIn
   }
 
   type_info.Copy<DstBackend, SrcBackend>(dst, srcs.data(), sizes.data(), num_samples,
-                                         order.stream(), use_copy_kernel);
+                                         order, use_copy_kernel);
 }
 
 /**
@@ -149,7 +149,7 @@ void CopyImpl(DstBatch<DstBackend> &dst, const SrcBatch<SrcBackend> &src, const
               AccessOrder copy_order, bool use_copy_kernel = false) {
   if (dst.IsContiguous() && src.IsContiguous()) {
     type_info.Copy<DstBackend, SrcBackend>(unsafe_raw_mutable_data(dst), unsafe_raw_data(src),
-                                           dst.shape().num_elements(), copy_order.stream(),
+                                           dst.shape().num_elements(), copy_order,
                                            use_copy_kernel);
   } else if (dst.IsContiguous() && !src.IsContiguous()) {
     copy_impl::CopySamplewiseImpl(unsafe_raw_mutable_data(dst), src,
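`CopyImpl` collapses contiguous-to-contiguous batches into one flat copy and routes everything else through `CopySamplewiseImpl`, which assembles per-sample pointer and size arrays. A sketch of a call into the pointer-array overload with the new signature; the helper and the backend choice are illustrative, not DALI code:

```cpp
#include <vector>

#include "dali/core/access_order.h"
#include "dali/pipeline/data/backend.h"
#include "dali/pipeline/data/types.h"

// Hypothetical helper: feed pre-gathered per-sample pointers/sizes into the
// pointer-array TypeInfo::Copy overload. With a host order, the copy is
// complete by the time the call returns; with a device order it is ordered
// on that stream.
void samplewise_copy(const dali::TypeInfo &type_info,
                     std::vector<void *> &dsts,
                     std::vector<const void *> &srcs,
                     std::vector<dali::Index> &sizes,
                     dali::AccessOrder order) {
  type_info.Copy<dali::GPUBackend, dali::GPUBackend>(
      dsts.data(), srcs.data(), sizes.data(), static_cast<int>(sizes.size()),
      order, /*use_copy_kernel=*/true);
}
```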
diff --git a/dali/pipeline/data/types.cc b/dali/pipeline/data/types.cc
index 1ce5fd654cb..950213d64fc 100644
--- a/dali/pipeline/data/types.cc
+++ b/dali/pipeline/data/types.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2017-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -31,6 +31,7 @@ const auto &_type_info_##Id = TypeTable::GetTypeId<Type>()
 #include "dali/pipeline/data/backend.h"
 #include "dali/core/per_stream_pool.h"
+#include "dali/core/cuda_stream_pool.h"
 #include "dali/kernels/common/scatter_gather.h"
 
 namespace dali {
@@ -45,36 +46,54 @@ ScatterGatherPool& ScatterGatherPoolInstance() {
 }
 
 void ScatterGatherCopy(void **dsts, const void **srcs, const Index *sizes, int n, int element_size,
-                       cudaStream_t stream) {
-  auto sc = ScatterGatherPoolInstance().Get(stream, kMaxSizePerBlock);
+                       AccessOrder order) {
+  if (!order.is_device()) {
+    CUDAStreamLease stream = CUDAStreamPool::instance().Get();
+    ScatterGatherCopy(dsts, srcs, sizes, n, element_size, stream);
+    AccessOrder::host().wait(stream);
+    return;
+  }
+  auto sc = ScatterGatherPoolInstance().Get(order.stream(), kMaxSizePerBlock);
   for (int i = 0; i < n; i++) {
     sc->AddCopy(dsts[i], srcs[i], sizes[i] * element_size);
   }
-  sc->Run(stream, true, kernels::ScatterGatherGPU::Method::Kernel);
+  sc->Run(order.stream(), true, kernels::ScatterGatherGPU::Method::Kernel);
 }
 
 void ScatterGatherCopy(void *dst, const void **srcs, const Index *sizes, int n, int element_size,
-                       cudaStream_t stream) {
-  auto sc = ScatterGatherPoolInstance().Get(stream, kMaxSizePerBlock);
+                       AccessOrder order) {
+  if (!order.is_device()) {
+    CUDAStreamLease stream = CUDAStreamPool::instance().Get();
+    ScatterGatherCopy(dst, srcs, sizes, n, element_size, stream);
+    AccessOrder::host().wait(stream);
+    return;
+  }
+  auto sc = ScatterGatherPoolInstance().Get(order.stream(), kMaxSizePerBlock);
   auto *sample_dst = reinterpret_cast<uint8_t *>(dst);
   for (int i = 0; i < n; i++) {
     auto nbytes = sizes[i] * element_size;
     sc->AddCopy(sample_dst, srcs[i], nbytes);
     sample_dst += nbytes;
   }
-  sc->Run(stream, true, kernels::ScatterGatherGPU::Method::Kernel);
+  sc->Run(order.stream(), true, kernels::ScatterGatherGPU::Method::Kernel);
 }
 
 void ScatterGatherCopy(void **dsts, const void *src, const Index *sizes, int n, int element_size,
-                       cudaStream_t stream) {
-  auto sc = ScatterGatherPoolInstance().Get(stream, kMaxSizePerBlock);
+                       AccessOrder order) {
+  if (!order.is_device()) {
+    CUDAStreamLease stream = CUDAStreamPool::instance().Get();
+    ScatterGatherCopy(dsts, src, sizes, n, element_size, stream);
+    AccessOrder::host().wait(stream);
+    return;
+  }
+  auto sc = ScatterGatherPoolInstance().Get(order.stream(), kMaxSizePerBlock);
   auto *sample_src = reinterpret_cast<const uint8_t *>(src);
   for (int i = 0; i < n; i++) {
     auto nbytes = sizes[i] * element_size;
     sc->AddCopy(dsts[i], sample_src, nbytes);
     sample_src += nbytes;
   }
-  sc->Run(stream, true, kernels::ScatterGatherGPU::Method::Kernel);
+  sc->Run(order.stream(), true, kernels::ScatterGatherGPU::Method::Kernel);
 }
 
 }  // namespace detail
@@ -86,75 +105,87 @@ TypeTable &TypeTable::instance() {
 
 template <typename DstBackend, typename SrcBackend>
 void TypeInfo::Copy(void *dst,
-                    const void *src, Index n, cudaStream_t stream, bool use_copy_kernel) const {
+                    const void *src, Index n, AccessOrder order, bool use_copy_kernel) const {
   constexpr bool is_host_to_host = std::is_same<DstBackend, CPUBackend>::value &&
                                    std::is_same<SrcBackend, CPUBackend>::value;
   if (n == 0)
     return;
   if (is_host_to_host) {
-    if (stream)
+    if (order.is_device())
       throw std::logic_error("Cannot issue a H2H copy on a stream");
     // Call our copy function
    copier_(dst, src, n);
  } else if (use_copy_kernel) {
-    detail::LaunchCopyKernel(dst, src, n * size(), stream);
+    if (order.is_device()) {
+      detail::LaunchCopyKernel(dst, src, n * size(), order.stream());
+    } else {
+      CUDAStreamLease stream = CUDAStreamPool::instance().Get();
+      detail::LaunchCopyKernel(dst, src, n * size(), stream);
+      CUDA_CALL(cudaStreamSynchronize(stream));
+    }
   } else {
-    MemCopy(dst, src, n*size(), stream);
+    if (order.is_device()) {
+      MemCopy(dst, src, n*size(), order.stream());
+    } else {
+      CUDAStreamLease stream = CUDAStreamPool::instance().Get();
+      MemCopy(dst, src, n*size(), stream);
+      CUDA_CALL(cudaStreamSynchronize(stream));
+    }
   }
 }
 
 template void TypeInfo::Copy<CPUBackend, CPUBackend>(void *dst,
-    const void *src, Index n, cudaStream_t stream, bool use_copy_kernel) const;
+    const void *src, Index n, AccessOrder order, bool use_copy_kernel) const;
 template void TypeInfo::Copy<CPUBackend, GPUBackend>(void *dst,
-    const void *src, Index n, cudaStream_t stream, bool use_copy_kernel) const;
+    const void *src, Index n, AccessOrder order, bool use_copy_kernel) const;
 template void TypeInfo::Copy<GPUBackend, CPUBackend>(void *dst,
-    const void *src, Index n, cudaStream_t stream, bool use_copy_kernel) const;
+    const void *src, Index n, AccessOrder order, bool use_copy_kernel) const;
 template void TypeInfo::Copy<GPUBackend, GPUBackend>(void *dst,
-    const void *src, Index n, cudaStream_t stream, bool use_copy_kernel) const;
+    const void *src, Index n, AccessOrder order, bool use_copy_kernel) const;
 
 template <typename DstBackend, typename SrcBackend>
 void TypeInfo::Copy(void **dsts, const void** srcs, const Index* sizes, int n,
-                    cudaStream_t stream, bool use_copy_kernel) const {
+                    AccessOrder order, bool use_copy_kernel) const {
   constexpr bool is_host_to_host = std::is_same<DstBackend, CPUBackend>::value &&
                                    std::is_same<SrcBackend, CPUBackend>::value;
   if (!is_host_to_host && use_copy_kernel) {
-    detail::ScatterGatherCopy(dsts, srcs, sizes, n, size(), stream);
+    detail::ScatterGatherCopy(dsts, srcs, sizes, n, size(), order);
   } else {
     for (int i = 0; i < n; i++) {
-      Copy<DstBackend, SrcBackend>(dsts[i], srcs[i], sizes[i], stream);
+      Copy<DstBackend, SrcBackend>(dsts[i], srcs[i], sizes[i], order);
     }
   }
 }
 
 template void TypeInfo::Copy<CPUBackend, CPUBackend>(void **dsts,
-    const void **src, const Index *sizes, int n, cudaStream_t stream, bool use_copy_kernel) const;
+    const void **src, const Index *sizes, int n, AccessOrder order, bool use_copy_kernel) const;
 template void TypeInfo::Copy<CPUBackend, GPUBackend>(void **dsts,
-    const void **src, const Index *sizes, int n, cudaStream_t stream, bool use_copy_kernel) const;
+    const void **src, const Index *sizes, int n, AccessOrder order, bool use_copy_kernel) const;
 template void TypeInfo::Copy<GPUBackend, CPUBackend>(void **dsts,
-    const void **src, const Index *sizes, int n, cudaStream_t stream, bool use_copy_kernel) const;
+    const void **src, const Index *sizes, int n, AccessOrder order, bool use_copy_kernel) const;
 template void TypeInfo::Copy<GPUBackend, GPUBackend>(void **dsts,
-    const void **src, const Index *sizes, int n, cudaStream_t stream, bool use_copy_kernel) const;
+    const void **src, const Index *sizes, int n, AccessOrder order, bool use_copy_kernel) const;
 
 template <typename DstBackend, typename SrcBackend>
 void TypeInfo::Copy(void *dst, const void** srcs, const Index* sizes, int n,
-                    cudaStream_t stream, bool use_copy_kernel) const {
+                    AccessOrder order, bool use_copy_kernel) const {
   constexpr bool is_host_to_host = std::is_same<DstBackend, CPUBackend>::value &&
                                    std::is_same<SrcBackend, CPUBackend>::value;
   if (!is_host_to_host && use_copy_kernel) {
-    detail::ScatterGatherCopy(dst, srcs, sizes, n, size(), stream);
+    detail::ScatterGatherCopy(dst, srcs, sizes, n, size(), order);
   } else {
     auto sample_dst = static_cast<uint8_t *>(dst);
     for (int i = 0; i < n; i++) {
-      Copy<DstBackend, SrcBackend>(sample_dst, srcs[i], sizes[i], stream);
+      Copy<DstBackend, SrcBackend>(sample_dst, srcs[i], sizes[i], order);
       sample_dst += sizes[i] * size();
     }
   }
 }
 
@@ -162,44 +193,44 @@ void TypeInfo::Copy(void *dst, const void** srcs, const Index* sizes, int n,
 
 template void TypeInfo::Copy<CPUBackend, CPUBackend>(void *dst,
-    const void **src, const Index *sizes, int n, cudaStream_t stream, bool use_copy_kernel) const;
+    const void **src, const Index *sizes, int n, AccessOrder order, bool use_copy_kernel) const;
 template void TypeInfo::Copy<CPUBackend, GPUBackend>(void *dst,
-    const void **src, const Index *sizes, int n, cudaStream_t stream, bool use_copy_kernel) const;
+    const void **src, const Index *sizes, int n, AccessOrder order, bool use_copy_kernel) const;
 template void TypeInfo::Copy<GPUBackend, CPUBackend>(void *dst,
-    const void **src, const Index *sizes, int n, cudaStream_t stream, bool use_copy_kernel) const;
+    const void **src, const Index *sizes, int n, AccessOrder order, bool use_copy_kernel) const;
 template void TypeInfo::Copy<GPUBackend, GPUBackend>(void *dst,
-    const void **src, const Index *sizes, int n, cudaStream_t stream, bool use_copy_kernel) const;
+    const void **src, const Index *sizes, int n, AccessOrder order, bool use_copy_kernel) const;
 
 template <typename DstBackend, typename SrcBackend>
 void TypeInfo::Copy(void **dsts, const void* src, const Index* sizes, int n,
-                    cudaStream_t stream, bool use_copy_kernel) const {
+                    AccessOrder order, bool use_copy_kernel) const {
   constexpr bool is_host_to_host = std::is_same<DstBackend, CPUBackend>::value &&
                                    std::is_same<SrcBackend, CPUBackend>::value;
   if (!is_host_to_host && use_copy_kernel) {
-    detail::ScatterGatherCopy(dsts, src, sizes, n, size(), stream);
+    detail::ScatterGatherCopy(dsts, src, sizes, n, size(), order);
   } else {
     auto sample_src = reinterpret_cast<const uint8_t *>(src);
     for (int i = 0; i < n; i++) {
-      Copy<DstBackend, SrcBackend>(dsts[i], sample_src, sizes[i], stream);
+      Copy<DstBackend, SrcBackend>(dsts[i], sample_src, sizes[i], order);
       sample_src += sizes[i] * size();
     }
   }
 }
 
 template void TypeInfo::Copy<CPUBackend, CPUBackend>(void **dsts,
-    const void *src, const Index *sizes, int n, cudaStream_t stream, bool use_copy_kernel) const;
+    const void *src, const Index *sizes, int n, AccessOrder order, bool use_copy_kernel) const;
 template void TypeInfo::Copy<CPUBackend, GPUBackend>(void **dsts,
-    const void *src, const Index *sizes, int n, cudaStream_t stream, bool use_copy_kernel) const;
+    const void *src, const Index *sizes, int n, AccessOrder order, bool use_copy_kernel) const;
 template void TypeInfo::Copy<GPUBackend, CPUBackend>(void **dsts,
-    const void *src, const Index *sizes, int n, cudaStream_t stream, bool use_copy_kernel) const;
+    const void *src, const Index *sizes, int n, AccessOrder order, bool use_copy_kernel) const;
 template void TypeInfo::Copy<GPUBackend, GPUBackend>(void **dsts,
-    const void *src, const Index *sizes, int n, cudaStream_t stream, bool use_copy_kernel) const;
+    const void *src, const Index *sizes, int n, AccessOrder order, bool use_copy_kernel) const;
 
 }  // namespace dali
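All host-order paths added in types.cc follow the same shape: lease a stream from the global pool, enqueue the work, then block the host until it finishes. Isolated as a sketch (the function is hypothetical; `CUDA_CALL` comes from `dali/core/cuda_error.h`):

```cpp
#include <cuda_runtime_api.h>

#include "dali/core/access_order.h"
#include "dali/core/cuda_error.h"
#include "dali/core/cuda_stream_pool.h"

// Hypothetical sketch of the host-order fallback used by ScatterGatherCopy
// and TypeInfo::Copy above: a cross-device copy always needs a stream, so a
// host order borrows one from the pool and synchronizes before returning.
void run_in_order(dali::AccessOrder order) {
  if (!order.is_device()) {
    dali::CUDAStreamLease stream = dali::CUDAStreamPool::instance().Get();
    // ... enqueue the copy / kernel on `stream` ...
    CUDA_CALL(cudaStreamSynchronize(stream));  // host order => synchronous
    return;
  }
  // ... enqueue on order.stream(); completion is owned by that stream ...
}
```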
diff --git a/dali/pipeline/data/types.h b/dali/pipeline/data/types.h
index 79c5c4bdce6..3eed9a590f3 100644
--- a/dali/pipeline/data/types.h
+++ b/dali/pipeline/data/types.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2017-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -25,6 +25,7 @@
 #include <type_traits>
 #include <utility>
 #include <vector>
+#include "dali/core/access_order.h"
 #include "dali/core/common.h"
 #include "dali/core/spinlock.h"
 #include "dali/core/float16.h"
@@ -344,7 +345,7 @@ class DLL_PUBLIC TypeInfo {
    * (only relevant for device and host pinned memory)
    */
   template <typename DstBackend, typename SrcBackend>
-  DLL_PUBLIC void Copy(void *dst, const void *src, Index n, cudaStream_t stream,
+  DLL_PUBLIC void Copy(void *dst, const void *src, Index n, AccessOrder order,
                        bool use_copy_kernel = false) const;
 
   /**
@@ -359,7 +360,7 @@ class DLL_PUBLIC TypeInfo {
    */
   template <typename DstBackend, typename SrcBackend>
   DLL_PUBLIC void Copy(void **dst, const void **srcs, const Index *sizes, int n,
-                       cudaStream_t stream, bool use_copy_kernel = false) const;
+                       AccessOrder order, bool use_copy_kernel = false) const;
 
   /**
    * @brief Copies from SrcBackend scattered locations to a contiguous DstBackend buffer
@@ -372,7 +373,7 @@ class DLL_PUBLIC TypeInfo {
    * (only relevant for device and host pinned memory)
    */
   template <typename DstBackend, typename SrcBackend>
-  DLL_PUBLIC void Copy(void *dst, const void **srcs, const Index *sizes, int n, cudaStream_t stream,
+  DLL_PUBLIC void Copy(void *dst, const void **srcs, const Index *sizes, int n, AccessOrder order,
                        bool use_copy_kernel = false) const;
 
   /**
@@ -386,7 +387,7 @@ class DLL_PUBLIC TypeInfo {
    * (only relevant for device and host pinned memory)
    */
   template <typename DstBackend, typename SrcBackend>
-  DLL_PUBLIC void Copy(void **dsts, const void *src, const Index *sizes, int n, cudaStream_t stream,
+  DLL_PUBLIC void Copy(void **dsts, const void *src, const Index *sizes, int n, AccessOrder order,
                        bool use_copy_kernel = false) const;
 
   DLL_PUBLIC inline DALIDataType id() const {
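With the declarations updated, a device-to-host copy that used to pass stream `0` now states its ordering explicitly. A minimal usage sketch against the new signature (the function is hypothetical; buffer pointers are assumed valid):

```cpp
#include "dali/core/access_order.h"
#include "dali/pipeline/data/backend.h"
#include "dali/pipeline/data/types.h"

// Sketch: a synchronous device-to-host copy of n elements. Passing
// AccessOrder::host() replaces the old trailing cudaStream_t argument and
// makes the synchronous intent explicit.
void d2h_copy(const dali::TypeInfo &type_info,
              void *dst_host, const void *src_dev, dali::Index n) {
  type_info.Copy<dali::CPUBackend, dali::GPUBackend>(
      dst_host, src_dev, n, dali::AccessOrder::host());
}
```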
diff --git a/dali/test/plugins/dummy/dummy.cc b/dali/test/plugins/dummy/dummy.cc
index fe3febd4639..d0fe0789019 100644
--- a/dali/test/plugins/dummy/dummy.cc
+++ b/dali/test/plugins/dummy/dummy.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2017-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -29,7 +29,8 @@ void Dummy<::dali::CPUBackend>::RunImpl(::dali::Workspace &ws) {
         [&, sample_id](int thread_id) {
           type.Copy<::dali::CPUBackend, ::dali::CPUBackend>(output.raw_mutable_tensor(sample_id),
                                                             input.raw_tensor(sample_id),
-                                                            in_shape.tensor_size(sample_id), 0);
+                                                            in_shape.tensor_size(sample_id),
+                                                            ::dali::AccessOrder::host());
         },
         in_shape.tensor_size(sample_id));
   }
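The plugin change is the user-visible side of the API: the old trailing `0` was an implicit null `cudaStream_t`, which is now spelled as an explicit host order, and it has to be, since `TypeInfo::Copy` throws for a host-to-host copy issued on a device order. Sketch (hypothetical wrapper):

```cpp
#include "dali/core/access_order.h"
#include "dali/pipeline/data/backend.h"
#include "dali/pipeline/data/types.h"

// Hypothetical wrapper showing the H2H rule: CPU-to-CPU copies must use a
// host order; a device order triggers
//   std::logic_error("Cannot issue a H2H copy on a stream").
void h2h_copy(const dali::TypeInfo &type, void *dst, const void *src,
              dali::Index n) {
  type.Copy<dali::CPUBackend, dali::CPUBackend>(dst, src, n,
                                                dali::AccessOrder::host());
}
```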
diff --git a/docs/examples/custom_operations/custom_operator/create_a_custom_operator.ipynb b/docs/examples/custom_operations/custom_operator/create_a_custom_operator.ipynb
index f938fca23be..94a190be59f 100644
--- a/docs/examples/custom_operations/custom_operator/create_a_custom_operator.ipynb
+++ b/docs/examples/custom_operations/custom_operator/create_a_custom_operator.ipynb
@@ -133,7 +133,8 @@
     "          type.Copy<::dali::CPUBackend, ::dali::CPUBackend>(\n",
     "              output.raw_mutable_tensor(sample_id),\n",
     "              input.raw_tensor(sample_id),\n",
-    "              in_shape.tensor_size(sample_id), 0);\n",
+    "              in_shape.tensor_size(sample_id),\n",
+    "              ::dali::AccessOrder::host());\n",
     "        },\n",
     "        in_shape.tensor_size(sample_id));\n",
     "      }\n",
diff --git a/docs/examples/custom_operations/custom_operator/customdummy/dummy.cc b/docs/examples/custom_operations/custom_operator/customdummy/dummy.cc
index 8733ab01816..f9f2046d365 100644
--- a/docs/examples/custom_operations/custom_operator/customdummy/dummy.cc
+++ b/docs/examples/custom_operations/custom_operator/customdummy/dummy.cc
@@ -16,7 +16,8 @@ void Dummy<::dali::CPUBackend>::RunImpl(::dali::Workspace &ws) {
         type.Copy<::dali::CPUBackend, ::dali::CPUBackend>(
             output.raw_mutable_tensor(sample_id),
             input.raw_tensor(sample_id),
-            in_shape.tensor_size(sample_id), 0);
+            in_shape.tensor_size(sample_id),
+            ::dali::AccessOrder::host());
       },
       in_shape.tensor_size(sample_id));
 }