Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 13 additions & 4 deletions sycl/source/handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -262,18 +262,27 @@ fill_copy_args(detail::handler_impl *impl,
auto ZCopyExtentComponent = detail::fill_image_type(SrcImgDesc, UrSrcDesc);
detail::fill_image_type(DestImgDesc, UrDestDesc);

impl->MSrcOffset = {SrcOffset[0], SrcOffset[1], SrcOffset[2]};
impl->MDestOffset = {DestOffset[0], DestOffset[1], DestOffset[2]};
// Copy args computed here are directly passed to UR. Various offsets and
// extents end up passed as ur_rect_offset_t and ur_rect_region_t. Both those
// structs expect their first component to be in bytes, not in pixels.
size_t SrcPixelSize = SrcImgDesc.num_channels * get_channel_size(SrcImgDesc);
size_t DestPixelSize =
DestImgDesc.num_channels * get_channel_size(DestImgDesc);

impl->MSrcOffset = {SrcOffset[0] * SrcPixelSize, SrcOffset[1], SrcOffset[2]};
impl->MDestOffset = {DestOffset[0] * DestPixelSize, DestOffset[1],
DestOffset[2]};
impl->MSrcImageDesc = UrSrcDesc;
impl->MDstImageDesc = UrDestDesc;
impl->MSrcImageFormat = UrSrcFormat;
impl->MDstImageFormat = UrDestFormat;
impl->MImageCopyFlags = ImageCopyFlags;

if (CopyExtent.size() != 0) {
impl->MCopyExtent = {CopyExtent[0], CopyExtent[1], CopyExtent[2]};
impl->MCopyExtent = {CopyExtent[0] * SrcPixelSize, CopyExtent[1],
CopyExtent[2]};
} else {
impl->MCopyExtent = {SrcImgDesc.width, SrcImgDesc.height,
impl->MCopyExtent = {SrcImgDesc.width * SrcPixelSize, SrcImgDesc.height,
ZCopyExtentComponent};
}

Expand Down
98 changes: 47 additions & 51 deletions unified-runtime/source/adapters/cuda/image.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -645,7 +645,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
};

unsigned int NumChannels = 0;
size_t PixelSizeBytes = 0;
[[maybe_unused]] size_t PixelSizeBytes = 0;

UR_CALL(urCalculateNumChannels(pSrcImageFormat->channelOrder, &NumChannels));

Expand Down Expand Up @@ -673,19 +673,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
(CUdeviceptr)pDst) != CUDA_SUCCESS;

size_t CopyExtentBytes = PixelSizeBytes * pCopyRegion->copyExtent.width;
const char *SrcWithOffset = static_cast<const char *>(pSrc) +
(pCopyRegion->srcOffset.x * PixelSizeBytes);
size_t CopyExtentBytes = pCopyRegion->copyExtent.width;
const char *SrcWithOffset =
static_cast<const char *>(pSrc) + pCopyRegion->srcOffset.x;

if (isCudaArray) {
UR_CHECK_ERROR(cuMemcpyHtoAAsync(
(CUarray)pDst, pCopyRegion->dstOffset.x * PixelSizeBytes,
static_cast<const void *>(SrcWithOffset), CopyExtentBytes,
Stream));
UR_CHECK_ERROR(
cuMemcpyHtoAAsync((CUarray)pDst, pCopyRegion->dstOffset.x,
static_cast<const void *>(SrcWithOffset),
CopyExtentBytes, Stream));
} else if (memType == CU_MEMORYTYPE_DEVICE) {
void *DstWithOffset =
static_cast<void *>(static_cast<char *>(pDst) +
(PixelSizeBytes * pCopyRegion->dstOffset.x));
void *DstWithOffset = static_cast<void *>(static_cast<char *>(pDst) +
pCopyRegion->dstOffset.x);
UR_CHECK_ERROR(
cuMemcpyHtoDAsync((CUdeviceptr)DstWithOffset,
static_cast<const void *>(SrcWithOffset),
Expand All @@ -698,11 +697,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
CUDA_MEMCPY2D cpy_desc = {};
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
cpy_desc.srcHost = pSrc;
cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes;
cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x;
cpy_desc.srcY = pCopyRegion->srcOffset.y;
cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes;
cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x;
cpy_desc.dstY = pCopyRegion->dstOffset.y;
cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width;
cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width;
cpy_desc.Height = pCopyRegion->copyExtent.height;
cpy_desc.srcPitch = pSrcImageDesc->rowPitch;
if (pDstImageDesc->rowPitch == 0) {
Expand All @@ -717,10 +716,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
UR_CHECK_ERROR(cuMemcpy2DAsync(&cpy_desc, Stream));
} else if (pDstImageDesc->type == UR_MEM_TYPE_IMAGE3D) {
CUDA_MEMCPY3D cpy_desc = {};
cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes;
cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x;
cpy_desc.srcY = pCopyRegion->srcOffset.y;
cpy_desc.srcZ = pCopyRegion->srcOffset.z;
cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes;
cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x;
cpy_desc.dstY = pCopyRegion->dstOffset.y;
cpy_desc.dstZ = pCopyRegion->dstOffset.z;
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
Expand All @@ -729,18 +728,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
cpy_desc.srcHeight = pSrcImageDesc->height;
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
cpy_desc.dstArray = (CUarray)pDst;
cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width;
cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width;
cpy_desc.Height = pCopyRegion->copyExtent.height;
cpy_desc.Depth = pCopyRegion->copyExtent.depth;
UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream));
} else if (pDstImageDesc->type == UR_MEM_TYPE_IMAGE1D_ARRAY ||
pDstImageDesc->type == UR_MEM_TYPE_IMAGE2D_ARRAY ||
pDstImageDesc->type == UR_MEM_TYPE_IMAGE_CUBEMAP_EXP) {
CUDA_MEMCPY3D cpy_desc = {};
cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes;
cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x;
cpy_desc.srcY = pCopyRegion->srcOffset.y;
cpy_desc.srcZ = pCopyRegion->srcOffset.z;
cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes;
cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x;
cpy_desc.dstY = pCopyRegion->dstOffset.y;
cpy_desc.dstZ = pCopyRegion->dstOffset.z;
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
Expand All @@ -749,7 +748,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
cpy_desc.srcHeight = std::max(uint64_t{1}, pSrcImageDesc->height);
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
cpy_desc.dstArray = (CUarray)pDst;
cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width;
cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width;
cpy_desc.Height = std::max(uint64_t{1}, pCopyRegion->copyExtent.height);
cpy_desc.Depth = pCopyRegion->copyExtent.depth;
UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream));
Expand All @@ -764,20 +763,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
(CUdeviceptr)pSrc) != CUDA_SUCCESS;

size_t CopyExtentBytes = PixelSizeBytes * pCopyRegion->copyExtent.width;
void *DstWithOffset =
static_cast<void *>(static_cast<char *>(pDst) +
(PixelSizeBytes * pCopyRegion->dstOffset.x));
size_t CopyExtentBytes = pCopyRegion->copyExtent.width;
void *DstWithOffset = static_cast<void *>(static_cast<char *>(pDst) +
pCopyRegion->dstOffset.x);

if (isCudaArray) {
UR_CHECK_ERROR(
cuMemcpyAtoHAsync(DstWithOffset, as_CUArray(pSrc),
PixelSizeBytes * pCopyRegion->srcOffset.x,
CopyExtentBytes, Stream));
UR_CHECK_ERROR(cuMemcpyAtoHAsync(DstWithOffset, as_CUArray(pSrc),
pCopyRegion->srcOffset.x,
CopyExtentBytes, Stream));
} else if (memType == CU_MEMORYTYPE_DEVICE) {
const char *SrcWithOffset =
static_cast<const char *>(pSrc) +
(pCopyRegion->srcOffset.x * PixelSizeBytes);
static_cast<const char *>(pSrc) + pCopyRegion->srcOffset.x;
UR_CHECK_ERROR(cuMemcpyDtoHAsync(DstWithOffset,
(CUdeviceptr)SrcWithOffset,
CopyExtentBytes, Stream));
Expand All @@ -787,11 +783,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
}
} else if (pSrcImageDesc->type == UR_MEM_TYPE_IMAGE2D) {
CUDA_MEMCPY2D cpy_desc = {};
cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes;
cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x;
cpy_desc.srcY = pCopyRegion->srcOffset.y;
cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes;
cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x;
cpy_desc.dstY = pCopyRegion->dstOffset.y;
cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width;
cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width;
cpy_desc.Height = pCopyRegion->copyExtent.height;
cpy_desc.dstPitch = pDstImageDesc->rowPitch;
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
Expand All @@ -808,10 +804,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
UR_CHECK_ERROR(cuMemcpy2DAsync(&cpy_desc, Stream));
} else if (pSrcImageDesc->type == UR_MEM_TYPE_IMAGE3D) {
CUDA_MEMCPY3D cpy_desc = {};
cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes;
cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x;
cpy_desc.srcY = pCopyRegion->srcOffset.y;
cpy_desc.srcZ = pCopyRegion->srcOffset.z;
cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes;
cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x;
cpy_desc.dstY = pCopyRegion->dstOffset.y;
cpy_desc.dstZ = pCopyRegion->dstOffset.z;
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
Expand All @@ -820,18 +816,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
cpy_desc.dstHost = pDst;
cpy_desc.dstPitch = pDstImageDesc->rowPitch;
cpy_desc.dstHeight = pDstImageDesc->height;
cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width;
cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width;
cpy_desc.Height = pCopyRegion->copyExtent.height;
cpy_desc.Depth = pCopyRegion->copyExtent.depth;
UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream));
} else if (pSrcImageDesc->type == UR_MEM_TYPE_IMAGE1D_ARRAY ||
pSrcImageDesc->type == UR_MEM_TYPE_IMAGE2D_ARRAY ||
pSrcImageDesc->type == UR_MEM_TYPE_IMAGE_CUBEMAP_EXP) {
CUDA_MEMCPY3D cpy_desc = {};
cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes;
cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x;
cpy_desc.srcY = pCopyRegion->srcOffset.y;
cpy_desc.srcZ = pCopyRegion->srcOffset.z;
cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes;
cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x;
cpy_desc.dstY = pCopyRegion->dstOffset.y;
cpy_desc.dstZ = pCopyRegion->dstOffset.z;
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
Expand All @@ -840,7 +836,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
cpy_desc.dstHost = pDst;
cpy_desc.dstPitch = pDstImageDesc->rowPitch;
cpy_desc.dstHeight = std::max(uint64_t{1}, pDstImageDesc->height);
cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width;
cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width;
cpy_desc.Height = std::max(uint64_t{1}, pCopyRegion->copyExtent.height);
cpy_desc.Depth = pCopyRegion->copyExtent.depth;
UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream));
Expand Down Expand Up @@ -874,11 +870,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
(CUdeviceptr)pDst) != CUDA_SUCCESS;

CUDA_MEMCPY2D cpy_desc = {};
cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes;
cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x;
cpy_desc.srcY = 0;
cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes;
cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x;
cpy_desc.dstY = 0;
cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width;
cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width;
cpy_desc.Height = 1;
if (isSrcCudaArray) {
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
Expand All @@ -897,11 +893,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
UR_CHECK_ERROR(cuMemcpy2DAsync(&cpy_desc, Stream));
} else if (pSrcImageDesc->type == UR_MEM_TYPE_IMAGE2D) {
CUDA_MEMCPY2D cpy_desc = {};
cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes;
cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x;
cpy_desc.srcY = pCopyRegion->srcOffset.y;
cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes;
cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x;
cpy_desc.dstY = pCopyRegion->dstOffset.y;
cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width;
cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width;
cpy_desc.Height = pCopyRegion->copyExtent.height;
if (pSrcImageDesc->rowPitch == 0) {
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
Expand All @@ -924,35 +920,35 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
UR_CHECK_ERROR(cuMemcpy2DAsync(&cpy_desc, Stream));
} else if (pSrcImageDesc->type == UR_MEM_TYPE_IMAGE3D) {
CUDA_MEMCPY3D cpy_desc = {};
cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes;
cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x;
cpy_desc.srcY = pCopyRegion->srcOffset.y;
cpy_desc.srcZ = pCopyRegion->srcOffset.z;
cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes;
cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x;
cpy_desc.dstY = pCopyRegion->dstOffset.y;
cpy_desc.dstZ = pCopyRegion->dstOffset.z;
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
cpy_desc.srcArray = as_CUArray(pSrc);
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
cpy_desc.dstArray = (CUarray)pDst;
cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width;
cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width;
cpy_desc.Height = pCopyRegion->copyExtent.height;
cpy_desc.Depth = pCopyRegion->copyExtent.depth;
UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream));
} else if (pSrcImageDesc->type == UR_MEM_TYPE_IMAGE1D_ARRAY ||
pSrcImageDesc->type == UR_MEM_TYPE_IMAGE2D_ARRAY ||
pSrcImageDesc->type == UR_MEM_TYPE_IMAGE_CUBEMAP_EXP) {
CUDA_MEMCPY3D cpy_desc = {};
cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes;
cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x;
cpy_desc.srcY = pCopyRegion->srcOffset.y;
cpy_desc.srcZ = pCopyRegion->srcOffset.z;
cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes;
cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x;
cpy_desc.dstY = pCopyRegion->dstOffset.y;
cpy_desc.dstZ = pCopyRegion->dstOffset.z;
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
cpy_desc.srcArray = as_CUArray(pSrc);
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
cpy_desc.dstArray = (CUarray)pDst;
cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width;
cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width;
cpy_desc.Height = std::max(uint64_t{1}, pCopyRegion->copyExtent.height);
cpy_desc.Depth = pCopyRegion->copyExtent.depth;
UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream));
Expand Down
Loading
Loading