From 880fd69c48fe0c98d045dbb3395ce034d1943a03 Mon Sep 17 00:00:00 2001 From: Alexey Sachkov Date: Mon, 1 Sep 2025 12:01:29 +0200 Subject: [PATCH 1/5] WIP --- sycl/source/handler.cpp | 17 +++++++++++++---- .../source/adapters/level_zero/image_common.cpp | 13 +++++++++++-- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 397f01983add4..43a9045bf20cf 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -262,8 +262,16 @@ fill_copy_args(detail::handler_impl *impl, auto ZCopyExtentComponent = detail::fill_image_type(SrcImgDesc, UrSrcDesc); detail::fill_image_type(DestImgDesc, UrDestDesc); - impl->MSrcOffset = {SrcOffset[0], SrcOffset[1], SrcOffset[2]}; - impl->MDestOffset = {DestOffset[0], DestOffset[1], DestOffset[2]}; + // Copy args computed here are directly passed to UR. Various offsets and + // extents end up passed as ur_rect_offset_t and ur_rect_region_t. Both those + // structs expect theirfirst component to be in bytes, not in pixels + size_t SrcPixelSize = SrcImgDesc.num_channels * get_channel_size(SrcImgDesc); + size_t DestPixelSize = + DestImgDesc.num_channels * get_channel_size(DestImgDesc); + + impl->MSrcOffset = {SrcOffset[0] * SrcPixelSize, SrcOffset[1], SrcOffset[2]}; + impl->MDestOffset = {DestOffset[0] * DestPixelSize, DestOffset[1], + DestOffset[2]}; impl->MSrcImageDesc = UrSrcDesc; impl->MDstImageDesc = UrDestDesc; impl->MSrcImageFormat = UrSrcFormat; @@ -271,9 +279,10 @@ fill_copy_args(detail::handler_impl *impl, impl->MImageCopyFlags = ImageCopyFlags; if (CopyExtent.size() != 0) { - impl->MCopyExtent = {CopyExtent[0], CopyExtent[1], CopyExtent[2]}; + impl->MCopyExtent = {CopyExtent[0] * SrcPixelSize, CopyExtent[1], + CopyExtent[2]}; } else { - impl->MCopyExtent = {SrcImgDesc.width, SrcImgDesc.height, + impl->MCopyExtent = {SrcImgDesc.width * SrcPixelSize, SrcImgDesc.height, ZCopyExtentComponent}; } diff --git a/unified-runtime/source/adapters/level_zero/image_common.cpp b/unified-runtime/source/adapters/level_zero/image_common.cpp index 10f2ef1430ce2..54b696dfb6651 100644 --- a/unified-runtime/source/adapters/level_zero/image_common.cpp +++ b/unified-runtime/source/adapters/level_zero/image_common.cpp @@ -747,11 +747,20 @@ ur_result_t getImageRegionHelper(ze_image_desc_t ZeImageDesc, UR_RESULT_ERROR_INVALID_VALUE); #endif // !NDEBUG - uint32_t OriginX = ur_cast(Origin->x); + // ur_rect_offset_t and ur_rect_region_t describe first component using bytes + // ze_image_region_t however uses pixels for it + + // TODO: this is less efficient than a direct calculation of a pixel size + // using ze_image_format_t + ur_image_format_t UrImageFormat; + UR_CALL(ze2urImageFormat(ZeImageDesc.format, &UrImageFormat)); + uint32_t PixelSizeBytes = getPixelSizeBytes(&UrImageFormat); + + uint32_t OriginX = ur_cast(Origin->x) / PixelSizeBytes; uint32_t OriginY = ur_cast(Origin->y); uint32_t OriginZ = ur_cast(Origin->z); - uint32_t Width = ur_cast(Region->width); + uint32_t Width = ur_cast(Region->width) / PixelSizeBytes; uint32_t Height = (ZeImageDesc.type == ZE_IMAGE_TYPE_1DARRAY) ? ZeImageDesc.arraylevels : ur_cast(Region->height); From 7837acfa0525b7edcdaf8a4bb6faa0a943053c5a Mon Sep 17 00:00:00 2001 From: Alexey Sachkov Date: Wed, 27 Aug 2025 07:02:47 -0700 Subject: [PATCH 2/5] Fix CUDA & HIP (untested locally) --- .../source/adapters/cuda/image.cpp | 98 ++++++++--------- unified-runtime/source/adapters/hip/image.cpp | 100 ++++++++---------- 2 files changed, 94 insertions(+), 104 deletions(-) diff --git a/unified-runtime/source/adapters/cuda/image.cpp b/unified-runtime/source/adapters/cuda/image.cpp index 4f2a83d0878c8..5c5ec3367442a 100644 --- a/unified-runtime/source/adapters/cuda/image.cpp +++ b/unified-runtime/source/adapters/cuda/image.cpp @@ -645,7 +645,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( }; unsigned int NumChannels = 0; - size_t PixelSizeBytes = 0; + [[maybe_unused]] size_t PixelSizeBytes = 0; UR_CALL(urCalculateNumChannels(pSrcImageFormat->channelOrder, &NumChannels)); @@ -673,19 +673,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)pDst) != CUDA_SUCCESS; - size_t CopyExtentBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; - const char *SrcWithOffset = static_cast(pSrc) + - (pCopyRegion->srcOffset.x * PixelSizeBytes); + size_t CopyExtentBytes = pCopyRegion->copyExtent.width; + const char *SrcWithOffset = + static_cast(pSrc) + pCopyRegion->srcOffset.x; if (isCudaArray) { - UR_CHECK_ERROR(cuMemcpyHtoAAsync( - (CUarray)pDst, pCopyRegion->dstOffset.x * PixelSizeBytes, - static_cast(SrcWithOffset), CopyExtentBytes, - Stream)); + UR_CHECK_ERROR( + cuMemcpyHtoAAsync((CUarray)pDst, pCopyRegion->dstOffset.x, + static_cast(SrcWithOffset), + CopyExtentBytes, Stream)); } else if (memType == CU_MEMORYTYPE_DEVICE) { - void *DstWithOffset = - static_cast(static_cast(pDst) + - (PixelSizeBytes * pCopyRegion->dstOffset.x)); + void *DstWithOffset = static_cast(static_cast(pDst) + + pCopyRegion->dstOffset.x); UR_CHECK_ERROR( cuMemcpyHtoDAsync((CUdeviceptr)DstWithOffset, static_cast(SrcWithOffset), @@ -698,11 +697,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( CUDA_MEMCPY2D cpy_desc = {}; cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; cpy_desc.srcHost = pSrc; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = pCopyRegion->copyExtent.height; cpy_desc.srcPitch = pSrcImageDesc->rowPitch; if (pDstImageDesc->rowPitch == 0) { @@ -717,10 +716,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( UR_CHECK_ERROR(cuMemcpy2DAsync(&cpy_desc, Stream)); } else if (pDstImageDesc->type == UR_MEM_TYPE_IMAGE3D) { CUDA_MEMCPY3D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; cpy_desc.srcZ = pCopyRegion->srcOffset.z; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.dstZ = pCopyRegion->dstOffset.z; cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; @@ -729,7 +728,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.srcHeight = pSrcImageDesc->height; cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; cpy_desc.dstArray = (CUarray)pDst; - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = pCopyRegion->copyExtent.height; cpy_desc.Depth = pCopyRegion->copyExtent.depth; UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream)); @@ -737,10 +736,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( pDstImageDesc->type == UR_MEM_TYPE_IMAGE2D_ARRAY || pDstImageDesc->type == UR_MEM_TYPE_IMAGE_CUBEMAP_EXP) { CUDA_MEMCPY3D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; cpy_desc.srcZ = pCopyRegion->srcOffset.z; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.dstZ = pCopyRegion->dstOffset.z; cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; @@ -749,7 +748,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.srcHeight = std::max(uint64_t{1}, pSrcImageDesc->height); cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; cpy_desc.dstArray = (CUarray)pDst; - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = std::max(uint64_t{1}, pCopyRegion->copyExtent.height); cpy_desc.Depth = pCopyRegion->copyExtent.depth; UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream)); @@ -764,20 +763,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)pSrc) != CUDA_SUCCESS; - size_t CopyExtentBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; - void *DstWithOffset = - static_cast(static_cast(pDst) + - (PixelSizeBytes * pCopyRegion->dstOffset.x)); + size_t CopyExtentBytes = CopyRegion->copyExtent.width; + void *DstWithOffset = static_cast(static_cast(pDst) + + pCopyRegion->dstOffset.x); if (isCudaArray) { - UR_CHECK_ERROR( - cuMemcpyAtoHAsync(DstWithOffset, as_CUArray(pSrc), - PixelSizeBytes * pCopyRegion->srcOffset.x, - CopyExtentBytes, Stream)); + UR_CHECK_ERROR(cuMemcpyAtoHAsync(DstWithOffset, as_CUArray(pSrc), + pCopyRegion->srcOffset.x, + CopyExtentBytes, Stream)); } else if (memType == CU_MEMORYTYPE_DEVICE) { const char *SrcWithOffset = - static_cast(pSrc) + - (pCopyRegion->srcOffset.x * PixelSizeBytes); + static_cast(pSrc) + pCopyRegion->srcOffset.x; UR_CHECK_ERROR(cuMemcpyDtoHAsync(DstWithOffset, (CUdeviceptr)SrcWithOffset, CopyExtentBytes, Stream)); @@ -787,11 +783,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( } } else if (pSrcImageDesc->type == UR_MEM_TYPE_IMAGE2D) { CUDA_MEMCPY2D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = pCopyRegion->copyExtent.height; cpy_desc.dstPitch = pDstImageDesc->rowPitch; cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; @@ -808,10 +804,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( UR_CHECK_ERROR(cuMemcpy2DAsync(&cpy_desc, Stream)); } else if (pSrcImageDesc->type == UR_MEM_TYPE_IMAGE3D) { CUDA_MEMCPY3D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; cpy_desc.srcZ = pCopyRegion->srcOffset.z; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.dstZ = pCopyRegion->dstOffset.z; cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; @@ -820,7 +816,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.dstHost = pDst; cpy_desc.dstPitch = pDstImageDesc->rowPitch; cpy_desc.dstHeight = pDstImageDesc->height; - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = pCopyRegion->copyExtent.height; cpy_desc.Depth = pCopyRegion->copyExtent.depth; UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream)); @@ -828,10 +824,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( pSrcImageDesc->type == UR_MEM_TYPE_IMAGE2D_ARRAY || pSrcImageDesc->type == UR_MEM_TYPE_IMAGE_CUBEMAP_EXP) { CUDA_MEMCPY3D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; cpy_desc.srcZ = pCopyRegion->srcOffset.z; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.dstZ = pCopyRegion->dstOffset.z; cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; @@ -840,7 +836,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.dstHost = pDst; cpy_desc.dstPitch = pDstImageDesc->rowPitch; cpy_desc.dstHeight = std::max(uint64_t{1}, pDstImageDesc->height); - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = std::max(uint64_t{1}, pCopyRegion->copyExtent.height); cpy_desc.Depth = pCopyRegion->copyExtent.depth; UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream)); @@ -874,11 +870,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( (CUdeviceptr)pDst) != CUDA_SUCCESS; CUDA_MEMCPY2D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = 0; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = 0; - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = 1; if (isSrcCudaArray) { cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; @@ -897,11 +893,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( UR_CHECK_ERROR(cuMemcpy2DAsync(&cpy_desc, Stream)); } else if (pSrcImageDesc->type == UR_MEM_TYPE_IMAGE2D) { CUDA_MEMCPY2D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = pCopyRegion->copyExtent.height; if (pSrcImageDesc->rowPitch == 0) { cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; @@ -924,17 +920,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( UR_CHECK_ERROR(cuMemcpy2DAsync(&cpy_desc, Stream)); } else if (pSrcImageDesc->type == UR_MEM_TYPE_IMAGE3D) { CUDA_MEMCPY3D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; cpy_desc.srcZ = pCopyRegion->srcOffset.z; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.dstZ = pCopyRegion->dstOffset.z; cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; cpy_desc.srcArray = as_CUArray(pSrc); cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; cpy_desc.dstArray = (CUarray)pDst; - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = pCopyRegion->copyExtent.height; cpy_desc.Depth = pCopyRegion->copyExtent.depth; UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream)); @@ -942,17 +938,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( pSrcImageDesc->type == UR_MEM_TYPE_IMAGE2D_ARRAY || pSrcImageDesc->type == UR_MEM_TYPE_IMAGE_CUBEMAP_EXP) { CUDA_MEMCPY3D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; cpy_desc.srcZ = pCopyRegion->srcOffset.z; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.dstZ = pCopyRegion->dstOffset.z; cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; cpy_desc.srcArray = as_CUArray(pSrc); cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; cpy_desc.dstArray = (CUarray)pDst; - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = std::max(uint64_t{1}, pCopyRegion->copyExtent.height); cpy_desc.Depth = pCopyRegion->copyExtent.depth; UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream)); diff --git a/unified-runtime/source/adapters/hip/image.cpp b/unified-runtime/source/adapters/hip/image.cpp index 0874df4c8ce85..05f04f1ff9eea 100644 --- a/unified-runtime/source/adapters/hip/image.cpp +++ b/unified-runtime/source/adapters/hip/image.cpp @@ -635,7 +635,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( UR_RESULT_ERROR_INVALID_ARGUMENT); unsigned int NumChannels = 0; - size_t PixelSizeBytes = 0; + [[maybe_unused]] size_t PixelSizeBytes = 0; UR_CALL(urCalculateNumChannels(pSrcImageFormat->channelOrder, &NumChannels)); @@ -665,29 +665,26 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( memType == hipMemoryTypeArray, UR_RESULT_ERROR_INVALID_VALUE); - size_t CopyExtentBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; - const char *SrcWithOffset = static_cast(pSrc) + - (pCopyRegion->srcOffset.x * PixelSizeBytes); + size_t CopyExtentBytes = pCopyRegion->copyExtent.width; + const char *SrcWithOffset = + static_cast(pSrc) + pCopyRegion->srcOffset.x; if (memType == hipMemoryTypeArray) { // HIP doesn not provide async copies between host and image arrays // memory in versions earlier than 6.2. #if HIP_VERSION >= 60200000 - UR_CHECK_ERROR( - hipMemcpyHtoAAsync(static_cast(pDst), - pCopyRegion->dstOffset.x * PixelSizeBytes, - static_cast(SrcWithOffset), - CopyExtentBytes, Stream)); + UR_CHECK_ERROR(hipMemcpyHtoAAsync( + static_cast(pDst), pCopyRegion->dstOffset.x, + static_cast(SrcWithOffset), CopyExtentBytes, + Stream)); #else UR_CHECK_ERROR(hipMemcpyHtoA( - static_cast(pDst), - pCopyRegion->dstOffset.x * PixelSizeBytes, + static_cast(pDst), pCopyRegion->dstOffset.x, static_cast(SrcWithOffset), CopyExtentBytes)); #endif } else if (memType == hipMemoryTypeDevice) { - void *DstWithOffset = - static_cast(static_cast(pDst) + - (PixelSizeBytes * pCopyRegion->dstOffset.x)); + void *DstWithOffset = static_cast(static_cast(pDst) + + pCopyRegion->dstOffset.x); UR_CHECK_ERROR(hipMemcpyHtoDAsync( static_cast(DstWithOffset), const_cast(static_cast(SrcWithOffset)), @@ -700,9 +697,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( hip_Memcpy2D cpy_desc = {}; cpy_desc.srcMemoryType = hipMemoryTypeHost; cpy_desc.srcHost = pSrc; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.srcPitch = pSrcImageDesc->rowPitch; if (pDstImageDesc->rowPitch == 0) { @@ -714,15 +711,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.dstDevice = static_cast(pDst); cpy_desc.dstPitch = pDstImageDesc->rowPitch; } - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = pCopyRegion->copyExtent.height; UR_CHECK_ERROR(hipMemcpyParam2DAsync(&cpy_desc, Stream)); } else if (pDstImageDesc->type == UR_MEM_TYPE_IMAGE3D) { HIP_MEMCPY3D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; cpy_desc.srcZ = pCopyRegion->srcOffset.z; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.dstZ = pCopyRegion->dstOffset.z; cpy_desc.srcMemoryType = hipMemoryTypeHost; @@ -731,7 +728,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.srcHeight = pSrcImageDesc->height; cpy_desc.dstMemoryType = hipMemoryTypeArray; cpy_desc.dstArray = static_cast(pDst); - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = pCopyRegion->copyExtent.height; cpy_desc.Depth = pCopyRegion->copyExtent.depth; // 'hipMemcpy3DAsync' requires us to correctly create 'hipMemcpy3DParms' @@ -741,10 +738,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( pDstImageDesc->type == UR_MEM_TYPE_IMAGE2D_ARRAY || pDstImageDesc->type == UR_MEM_TYPE_IMAGE_CUBEMAP_EXP) { HIP_MEMCPY3D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; cpy_desc.srcZ = pCopyRegion->srcOffset.z; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.dstZ = pCopyRegion->dstOffset.z; cpy_desc.srcMemoryType = hipMemoryTypeHost; @@ -753,7 +750,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.srcHeight = std::max(MinCopyHeight, pSrcImageDesc->height); cpy_desc.dstMemoryType = hipMemoryTypeArray; cpy_desc.dstArray = static_cast(pDst); - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = std::max(MinCopyHeight, pCopyRegion->copyExtent.height); cpy_desc.Depth = pCopyRegion->copyExtent.depth; @@ -774,10 +771,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( memType == hipMemoryTypeArray, UR_RESULT_ERROR_INVALID_VALUE); - size_t CopyExtentBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; - void *DstWithOffset = - static_cast(static_cast(pDst) + - (PixelSizeBytes * pCopyRegion->dstOffset.x)); + size_t CopyExtentBytes = pCopyRegion->copyExtent.width; + void *DstWithOffset = static_cast(static_cast(pDst) + + pCopyRegion->dstOffset.x); if (memType == hipMemoryTypeArray) { // HIP doesn not provide async copies between image arrays and host @@ -785,17 +781,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( #if HIP_VERSION >= 60200000 UR_CHECK_ERROR(hipMemcpyAtoHAsync( DstWithOffset, static_cast(const_cast(pSrc)), - PixelSizeBytes * pCopyRegion->srcOffset.x, CopyExtentBytes, - Stream)); + pCopyRegion->srcOffset.x, CopyExtentBytes, Stream)); #else UR_CHECK_ERROR(hipMemcpyAtoH( DstWithOffset, static_cast(const_cast(pSrc)), - PixelSizeBytes * pCopyRegion->srcOffset.x, CopyExtentBytes)); + pCopyRegion->srcOffset.x, CopyExtentBytes)); #endif } else if (memType == hipMemoryTypeDevice) { const char *SrcWithOffset = - static_cast(pSrc) + - (pCopyRegion->srcOffset.x * PixelSizeBytes); + static_cast(pSrc) + pCopyRegion->srcOffset.x; UR_CHECK_ERROR(hipMemcpyDtoHAsync( DstWithOffset, static_cast(const_cast(SrcWithOffset)), @@ -806,9 +800,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( } } else if (pSrcImageDesc->type == UR_MEM_TYPE_IMAGE2D) { hip_Memcpy2D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.dstMemoryType = hipMemoryTypeHost; cpy_desc.dstHost = pDst; @@ -825,15 +819,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.dstMemoryType = hipMemoryTypeHost; cpy_desc.dstHost = pDst; cpy_desc.dstPitch = pDstImageDesc->rowPitch; - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = pCopyRegion->copyExtent.height; UR_CHECK_ERROR(hipMemcpyParam2DAsync(&cpy_desc, Stream)); } else if (pSrcImageDesc->type == UR_MEM_TYPE_IMAGE3D) { HIP_MEMCPY3D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; cpy_desc.srcZ = pCopyRegion->srcOffset.z; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.dstZ = pCopyRegion->dstOffset.z; cpy_desc.srcMemoryType = hipMemoryTypeArray; @@ -842,7 +836,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.dstHost = pDst; cpy_desc.dstPitch = pDstImageDesc->rowPitch; cpy_desc.dstHeight = pDstImageDesc->height; - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = pCopyRegion->copyExtent.height; cpy_desc.Depth = pCopyRegion->copyExtent.depth; // 'hipMemcpy3DAsync' requires us to correctly create @@ -853,10 +847,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( pSrcImageDesc->type == UR_MEM_TYPE_IMAGE2D_ARRAY || pSrcImageDesc->type == UR_MEM_TYPE_IMAGE_CUBEMAP_EXP) { HIP_MEMCPY3D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; cpy_desc.srcZ = pCopyRegion->srcOffset.z; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.dstZ = pCopyRegion->dstOffset.z; cpy_desc.srcMemoryType = hipMemoryTypeArray; @@ -865,7 +859,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.dstHost = pDst; cpy_desc.dstPitch = pDstImageDesc->rowPitch; cpy_desc.dstHeight = std::max(MinCopyHeight, pDstImageDesc->height); - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = std::max(MinCopyHeight, pCopyRegion->copyExtent.height); cpy_desc.Depth = pCopyRegion->copyExtent.depth; @@ -889,43 +883,43 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( // the end if (pSrcImageDesc->type == UR_MEM_TYPE_IMAGE1D) { hip_Memcpy2D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = 0; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = 0; cpy_desc.srcMemoryType = hipMemoryTypeArray; cpy_desc.srcArray = static_cast(const_cast(pSrc)); cpy_desc.dstMemoryType = hipMemoryTypeArray; cpy_desc.dstArray = static_cast(pDst); - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = 1; UR_CHECK_ERROR(hipMemcpyParam2DAsync(&cpy_desc, Stream)); } else if (pSrcImageDesc->type == UR_MEM_TYPE_IMAGE2D) { hip_Memcpy2D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.srcMemoryType = hipMemoryTypeArray; cpy_desc.srcArray = static_cast(const_cast(pSrc)); cpy_desc.dstMemoryType = hipMemoryTypeArray; cpy_desc.dstArray = static_cast(pDst); - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = pCopyRegion->copyExtent.height; UR_CHECK_ERROR(hipMemcpyParam2DAsync(&cpy_desc, Stream)); } else if (pSrcImageDesc->type == UR_MEM_TYPE_IMAGE3D) { HIP_MEMCPY3D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; cpy_desc.srcZ = pCopyRegion->srcOffset.z; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.dstZ = pCopyRegion->dstOffset.z; cpy_desc.srcMemoryType = hipMemoryTypeArray; cpy_desc.srcArray = static_cast(const_cast(pSrc)); cpy_desc.dstMemoryType = hipMemoryTypeArray; cpy_desc.dstArray = static_cast(pDst); - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = pCopyRegion->copyExtent.height; cpy_desc.Depth = pCopyRegion->copyExtent.depth; // 'hipMemcpy3DAsync' requires us to correctly create @@ -936,17 +930,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( pSrcImageDesc->type == UR_MEM_TYPE_IMAGE2D_ARRAY || pSrcImageDesc->type == UR_MEM_TYPE_IMAGE_CUBEMAP_EXP) { HIP_MEMCPY3D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; cpy_desc.srcZ = pCopyRegion->srcOffset.z; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.dstZ = pCopyRegion->dstOffset.z; cpy_desc.srcMemoryType = hipMemoryTypeArray; cpy_desc.srcArray = static_cast(const_cast(pSrc)); cpy_desc.dstMemoryType = hipMemoryTypeArray; cpy_desc.dstArray = static_cast(pDst); - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = std::max(MinCopyHeight, pCopyRegion->copyExtent.height); cpy_desc.Depth = pCopyRegion->copyExtent.depth; From dbcdff56113c0dbd373a69b88fd30088c82f2e9c Mon Sep 17 00:00:00 2001 From: Alexey Sachkov Date: Thu, 28 Aug 2025 03:30:51 -0700 Subject: [PATCH 3/5] Fix CUDA adapter build --- unified-runtime/source/adapters/cuda/image.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unified-runtime/source/adapters/cuda/image.cpp b/unified-runtime/source/adapters/cuda/image.cpp index 5c5ec3367442a..4f1f1892cddfd 100644 --- a/unified-runtime/source/adapters/cuda/image.cpp +++ b/unified-runtime/source/adapters/cuda/image.cpp @@ -763,7 +763,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)pSrc) != CUDA_SUCCESS; - size_t CopyExtentBytes = CopyRegion->copyExtent.width; + size_t CopyExtentBytes = pCopyRegion->copyExtent.width; void *DstWithOffset = static_cast(static_cast(pDst) + pCopyRegion->dstOffset.x); From 7b99dc89fd42d9b36f83fbaa2e56a425d10b0428 Mon Sep 17 00:00:00 2001 From: Alexey Sachkov Date: Wed, 3 Sep 2025 16:23:17 +0200 Subject: [PATCH 4/5] Limit the change to bindless images only --- .../adapters/level_zero/image_common.cpp | 68 ++++++++++++------- 1 file changed, 42 insertions(+), 26 deletions(-) diff --git a/unified-runtime/source/adapters/level_zero/image_common.cpp b/unified-runtime/source/adapters/level_zero/image_common.cpp index 54b696dfb6651..e4cbd8edbe9a8 100644 --- a/unified-runtime/source/adapters/level_zero/image_common.cpp +++ b/unified-runtime/source/adapters/level_zero/image_common.cpp @@ -747,20 +747,11 @@ ur_result_t getImageRegionHelper(ze_image_desc_t ZeImageDesc, UR_RESULT_ERROR_INVALID_VALUE); #endif // !NDEBUG - // ur_rect_offset_t and ur_rect_region_t describe first component using bytes - // ze_image_region_t however uses pixels for it - - // TODO: this is less efficient than a direct calculation of a pixel size - // using ze_image_format_t - ur_image_format_t UrImageFormat; - UR_CALL(ze2urImageFormat(ZeImageDesc.format, &UrImageFormat)); - uint32_t PixelSizeBytes = getPixelSizeBytes(&UrImageFormat); - - uint32_t OriginX = ur_cast(Origin->x) / PixelSizeBytes; + uint32_t OriginX = ur_cast(Origin->x); uint32_t OriginY = ur_cast(Origin->y); uint32_t OriginZ = ur_cast(Origin->z); - uint32_t Width = ur_cast(Region->width) / PixelSizeBytes; + uint32_t Width = ur_cast(Region->width); uint32_t Height = (ZeImageDesc.type == ZE_IMAGE_TYPE_1DARRAY) ? ZeImageDesc.arraylevels : ur_cast(Region->height); @@ -773,6 +764,27 @@ ur_result_t getImageRegionHelper(ze_image_desc_t ZeImageDesc, return UR_RESULT_SUCCESS; } +// ur_rect_offset_t and ur_rect_region_t describe their first component as +// bytes, whilst ze_image_region_t uses pixels. +// +// However, the getImageRegionHelper above is used for both bindless and regular +// images and APIs for the latter explicitly document that ur_rect_offset_t and +// ur_rect_region_t are misused and all their component are treated as pixels. +// +// As such, a new helper function for translation between UR and L0 formats is +// introduced instead of modifying the existing one above. +static ur_result_t getZeImageRegionHelper(ze_image_desc_t ZeImageDesc, + size_t PixelSizeInBytes, + ur_rect_offset_t *Origin, + ur_rect_region_t *Region, + ze_image_region_t &ZeRegion) { + UR_CALL(getImageRegionHelper(ZeImageDesc, Origin, Region, ZeRegion)); + ZeRegion.originX /= PixelSizeInBytes; + ZeRegion.width /= PixelSizeInBytes; + + return UR_RESULT_SUCCESS; +} + ur_result_t bindlessImagesHandleCopyFlags( const void *pSrc, void *pDst, const ur_image_desc_t *pSrcImageDesc, const ur_image_desc_t *pDstImageDesc, @@ -785,6 +797,8 @@ ur_result_t bindlessImagesHandleCopyFlags( ZeStruct zeSrcImageDesc; ur2zeImageDesc(pSrcImageFormat, pSrcImageDesc, zeSrcImageDesc); + uint32_t SrcPixelSizeInBytes = getPixelSizeBytes(pSrcImageFormat); + uint32_t DstPixelSizeInBytes = getPixelSizeBytes(pDstImageFormat); switch (imageCopyFlags) { case UR_EXP_IMAGE_COPY_FLAG_HOST_TO_DEVICE: { @@ -794,15 +808,15 @@ ur_result_t bindlessImagesHandleCopyFlags( // Copy to Non-USM memory ze_image_region_t DstRegion; - UR_CALL(getImageRegionHelper(zeSrcImageDesc, &pCopyRegion->dstOffset, - &pCopyRegion->copyExtent, DstRegion)); + UR_CALL(getZeImageRegionHelper(zeSrcImageDesc, SrcPixelSizeInBytes, + &pCopyRegion->dstOffset, + &pCopyRegion->copyExtent, DstRegion)); auto *urDstImg = static_cast(pDst); - const char *SrcPtr = - static_cast(pSrc) + - pCopyRegion->srcOffset.z * SrcSlicePitch + - pCopyRegion->srcOffset.y * SrcRowPitch + - pCopyRegion->srcOffset.x * getPixelSizeBytes(pSrcImageFormat); + const char *SrcPtr = static_cast(pSrc) + + pCopyRegion->srcOffset.z * SrcSlicePitch + + pCopyRegion->srcOffset.y * SrcRowPitch + + pCopyRegion->srcOffset.x; ZE2UR_CALL(zeCommandListAppendImageCopyFromMemoryExt, (ZeCommandList, urDstImg->getZeImage(), SrcPtr, &DstRegion, @@ -837,15 +851,15 @@ ur_result_t bindlessImagesHandleCopyFlags( if (pSrcImageDesc->rowPitch == 0) { // Copy from Non-USM memory to host ze_image_region_t SrcRegion; - UR_CALL(getImageRegionHelper(zeSrcImageDesc, &pCopyRegion->srcOffset, - &pCopyRegion->copyExtent, SrcRegion)); + UR_CALL(getZeImageRegionHelper(zeSrcImageDesc, SrcPixelSizeInBytes, + &pCopyRegion->srcOffset, + &pCopyRegion->copyExtent, SrcRegion)); auto *urSrcImg = reinterpret_cast(pSrc); char *DstPtr = static_cast(pDst) + pCopyRegion->dstOffset.z * DstSlicePitch + - pCopyRegion->dstOffset.y * DstRowPitch + - pCopyRegion->dstOffset.x * getPixelSizeBytes(pDstImageFormat); + pCopyRegion->dstOffset.y * DstRowPitch + pCopyRegion->dstOffset.x; ZE2UR_CALL(zeCommandListAppendImageCopyToMemoryExt, (ZeCommandList, DstPtr, urSrcImg->getZeImage(), &SrcRegion, DstRowPitch, DstSlicePitch, zeSignalEvent, numWaitEvents, @@ -875,11 +889,13 @@ ur_result_t bindlessImagesHandleCopyFlags( }; case UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_DEVICE: { ze_image_region_t DstRegion; - UR_CALL(getImageRegionHelper(zeSrcImageDesc, &pCopyRegion->dstOffset, - &pCopyRegion->copyExtent, DstRegion)); + UR_CALL(getZeImageRegionHelper(zeSrcImageDesc, DstPixelSizeInBytes, + &pCopyRegion->dstOffset, + &pCopyRegion->copyExtent, DstRegion)); ze_image_region_t SrcRegion; - UR_CALL(getImageRegionHelper(zeSrcImageDesc, &pCopyRegion->srcOffset, - &pCopyRegion->copyExtent, SrcRegion)); + UR_CALL(getZeImageRegionHelper(zeSrcImageDesc, SrcPixelSizeInBytes, + &pCopyRegion->srcOffset, + &pCopyRegion->copyExtent, SrcRegion)); auto *urImgSrc = reinterpret_cast(pSrc); auto *urImgDst = reinterpret_cast(pDst); From 6cdf6780e9ea6678d7421f5581048c574cf72024 Mon Sep 17 00:00:00 2001 From: Alexey Sachkov Date: Mon, 15 Sep 2025 15:47:36 +0200 Subject: [PATCH 5/5] Update sycl/source/handler.cpp Co-authored-by: Steffen Larsen --- sycl/source/handler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 43a9045bf20cf..e0a06e2199c64 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -264,7 +264,7 @@ fill_copy_args(detail::handler_impl *impl, // Copy args computed here are directly passed to UR. Various offsets and // extents end up passed as ur_rect_offset_t and ur_rect_region_t. Both those - // structs expect theirfirst component to be in bytes, not in pixels + // structs expect their first component to be in bytes, not in pixels size_t SrcPixelSize = SrcImgDesc.num_channels * get_channel_size(SrcImgDesc); size_t DestPixelSize = DestImgDesc.num_channels * get_channel_size(DestImgDesc);