diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 71aa92124f1c8..d74d96f397093 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -267,8 +267,16 @@ fill_copy_args(detail::handler_impl *impl, auto ZCopyExtentComponent = detail::fill_image_type(SrcImgDesc, UrSrcDesc); detail::fill_image_type(DestImgDesc, UrDestDesc); - impl->MSrcOffset = {SrcOffset[0], SrcOffset[1], SrcOffset[2]}; - impl->MDestOffset = {DestOffset[0], DestOffset[1], DestOffset[2]}; + // Copy args computed here are directly passed to UR. Various offsets and + // extents end up passed as ur_rect_offset_t and ur_rect_region_t. Both those + // structs expect their first component to be in bytes, not in pixels + size_t SrcPixelSize = SrcImgDesc.num_channels * get_channel_size(SrcImgDesc); + size_t DestPixelSize = + DestImgDesc.num_channels * get_channel_size(DestImgDesc); + + impl->MSrcOffset = {SrcOffset[0] * SrcPixelSize, SrcOffset[1], SrcOffset[2]}; + impl->MDestOffset = {DestOffset[0] * DestPixelSize, DestOffset[1], + DestOffset[2]}; impl->MSrcImageDesc = UrSrcDesc; impl->MDstImageDesc = UrDestDesc; impl->MSrcImageFormat = UrSrcFormat; @@ -276,9 +284,10 @@ fill_copy_args(detail::handler_impl *impl, impl->MImageCopyFlags = ImageCopyFlags; if (CopyExtent.size() != 0) { - impl->MCopyExtent = {CopyExtent[0], CopyExtent[1], CopyExtent[2]}; + impl->MCopyExtent = {CopyExtent[0] * SrcPixelSize, CopyExtent[1], + CopyExtent[2]}; } else { - impl->MCopyExtent = {SrcImgDesc.width, SrcImgDesc.height, + impl->MCopyExtent = {SrcImgDesc.width * SrcPixelSize, SrcImgDesc.height, ZCopyExtentComponent}; } diff --git a/unified-runtime/source/adapters/cuda/image.cpp b/unified-runtime/source/adapters/cuda/image.cpp index 4f2a83d0878c8..4f1f1892cddfd 100644 --- a/unified-runtime/source/adapters/cuda/image.cpp +++ b/unified-runtime/source/adapters/cuda/image.cpp @@ -645,7 +645,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( }; unsigned int NumChannels = 0; - size_t PixelSizeBytes = 0; + [[maybe_unused]] size_t PixelSizeBytes = 0; UR_CALL(urCalculateNumChannels(pSrcImageFormat->channelOrder, &NumChannels)); @@ -673,19 +673,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)pDst) != CUDA_SUCCESS; - size_t CopyExtentBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; - const char *SrcWithOffset = static_cast(pSrc) + - (pCopyRegion->srcOffset.x * PixelSizeBytes); + size_t CopyExtentBytes = pCopyRegion->copyExtent.width; + const char *SrcWithOffset = + static_cast(pSrc) + pCopyRegion->srcOffset.x; if (isCudaArray) { - UR_CHECK_ERROR(cuMemcpyHtoAAsync( - (CUarray)pDst, pCopyRegion->dstOffset.x * PixelSizeBytes, - static_cast(SrcWithOffset), CopyExtentBytes, - Stream)); + UR_CHECK_ERROR( + cuMemcpyHtoAAsync((CUarray)pDst, pCopyRegion->dstOffset.x, + static_cast(SrcWithOffset), + CopyExtentBytes, Stream)); } else if (memType == CU_MEMORYTYPE_DEVICE) { - void *DstWithOffset = - static_cast(static_cast(pDst) + - (PixelSizeBytes * pCopyRegion->dstOffset.x)); + void *DstWithOffset = static_cast(static_cast(pDst) + + pCopyRegion->dstOffset.x); UR_CHECK_ERROR( cuMemcpyHtoDAsync((CUdeviceptr)DstWithOffset, static_cast(SrcWithOffset), @@ -698,11 +697,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( CUDA_MEMCPY2D cpy_desc = {}; cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; cpy_desc.srcHost = pSrc; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = pCopyRegion->copyExtent.height; cpy_desc.srcPitch = pSrcImageDesc->rowPitch; if (pDstImageDesc->rowPitch == 0) { @@ -717,10 +716,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( UR_CHECK_ERROR(cuMemcpy2DAsync(&cpy_desc, Stream)); } else if (pDstImageDesc->type == UR_MEM_TYPE_IMAGE3D) { CUDA_MEMCPY3D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; cpy_desc.srcZ = pCopyRegion->srcOffset.z; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.dstZ = pCopyRegion->dstOffset.z; cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; @@ -729,7 +728,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.srcHeight = pSrcImageDesc->height; cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; cpy_desc.dstArray = (CUarray)pDst; - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = pCopyRegion->copyExtent.height; cpy_desc.Depth = pCopyRegion->copyExtent.depth; UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream)); @@ -737,10 +736,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( pDstImageDesc->type == UR_MEM_TYPE_IMAGE2D_ARRAY || pDstImageDesc->type == UR_MEM_TYPE_IMAGE_CUBEMAP_EXP) { CUDA_MEMCPY3D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; cpy_desc.srcZ = pCopyRegion->srcOffset.z; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.dstZ = pCopyRegion->dstOffset.z; cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; @@ -749,7 +748,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.srcHeight = std::max(uint64_t{1}, pSrcImageDesc->height); cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; cpy_desc.dstArray = (CUarray)pDst; - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = std::max(uint64_t{1}, pCopyRegion->copyExtent.height); cpy_desc.Depth = pCopyRegion->copyExtent.depth; UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream)); @@ -764,20 +763,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)pSrc) != CUDA_SUCCESS; - size_t CopyExtentBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; - void *DstWithOffset = - static_cast(static_cast(pDst) + - (PixelSizeBytes * pCopyRegion->dstOffset.x)); + size_t CopyExtentBytes = pCopyRegion->copyExtent.width; + void *DstWithOffset = static_cast(static_cast(pDst) + + pCopyRegion->dstOffset.x); if (isCudaArray) { - UR_CHECK_ERROR( - cuMemcpyAtoHAsync(DstWithOffset, as_CUArray(pSrc), - PixelSizeBytes * pCopyRegion->srcOffset.x, - CopyExtentBytes, Stream)); + UR_CHECK_ERROR(cuMemcpyAtoHAsync(DstWithOffset, as_CUArray(pSrc), + pCopyRegion->srcOffset.x, + CopyExtentBytes, Stream)); } else if (memType == CU_MEMORYTYPE_DEVICE) { const char *SrcWithOffset = - static_cast(pSrc) + - (pCopyRegion->srcOffset.x * PixelSizeBytes); + static_cast(pSrc) + pCopyRegion->srcOffset.x; UR_CHECK_ERROR(cuMemcpyDtoHAsync(DstWithOffset, (CUdeviceptr)SrcWithOffset, CopyExtentBytes, Stream)); @@ -787,11 +783,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( } } else if (pSrcImageDesc->type == UR_MEM_TYPE_IMAGE2D) { CUDA_MEMCPY2D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = pCopyRegion->copyExtent.height; cpy_desc.dstPitch = pDstImageDesc->rowPitch; cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; @@ -808,10 +804,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( UR_CHECK_ERROR(cuMemcpy2DAsync(&cpy_desc, Stream)); } else if (pSrcImageDesc->type == UR_MEM_TYPE_IMAGE3D) { CUDA_MEMCPY3D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; cpy_desc.srcZ = pCopyRegion->srcOffset.z; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.dstZ = pCopyRegion->dstOffset.z; cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; @@ -820,7 +816,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.dstHost = pDst; cpy_desc.dstPitch = pDstImageDesc->rowPitch; cpy_desc.dstHeight = pDstImageDesc->height; - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = pCopyRegion->copyExtent.height; cpy_desc.Depth = pCopyRegion->copyExtent.depth; UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream)); @@ -828,10 +824,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( pSrcImageDesc->type == UR_MEM_TYPE_IMAGE2D_ARRAY || pSrcImageDesc->type == UR_MEM_TYPE_IMAGE_CUBEMAP_EXP) { CUDA_MEMCPY3D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; cpy_desc.srcZ = pCopyRegion->srcOffset.z; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.dstZ = pCopyRegion->dstOffset.z; cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; @@ -840,7 +836,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.dstHost = pDst; cpy_desc.dstPitch = pDstImageDesc->rowPitch; cpy_desc.dstHeight = std::max(uint64_t{1}, pDstImageDesc->height); - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = std::max(uint64_t{1}, pCopyRegion->copyExtent.height); cpy_desc.Depth = pCopyRegion->copyExtent.depth; UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream)); @@ -874,11 +870,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( (CUdeviceptr)pDst) != CUDA_SUCCESS; CUDA_MEMCPY2D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = 0; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = 0; - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = 1; if (isSrcCudaArray) { cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; @@ -897,11 +893,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( UR_CHECK_ERROR(cuMemcpy2DAsync(&cpy_desc, Stream)); } else if (pSrcImageDesc->type == UR_MEM_TYPE_IMAGE2D) { CUDA_MEMCPY2D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = pCopyRegion->copyExtent.height; if (pSrcImageDesc->rowPitch == 0) { cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; @@ -924,17 +920,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( UR_CHECK_ERROR(cuMemcpy2DAsync(&cpy_desc, Stream)); } else if (pSrcImageDesc->type == UR_MEM_TYPE_IMAGE3D) { CUDA_MEMCPY3D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; cpy_desc.srcZ = pCopyRegion->srcOffset.z; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.dstZ = pCopyRegion->dstOffset.z; cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; cpy_desc.srcArray = as_CUArray(pSrc); cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; cpy_desc.dstArray = (CUarray)pDst; - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = pCopyRegion->copyExtent.height; cpy_desc.Depth = pCopyRegion->copyExtent.depth; UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream)); @@ -942,17 +938,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( pSrcImageDesc->type == UR_MEM_TYPE_IMAGE2D_ARRAY || pSrcImageDesc->type == UR_MEM_TYPE_IMAGE_CUBEMAP_EXP) { CUDA_MEMCPY3D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; cpy_desc.srcZ = pCopyRegion->srcOffset.z; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.dstZ = pCopyRegion->dstOffset.z; cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; cpy_desc.srcArray = as_CUArray(pSrc); cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; cpy_desc.dstArray = (CUarray)pDst; - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = std::max(uint64_t{1}, pCopyRegion->copyExtent.height); cpy_desc.Depth = pCopyRegion->copyExtent.depth; UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream)); diff --git a/unified-runtime/source/adapters/hip/image.cpp b/unified-runtime/source/adapters/hip/image.cpp index 0874df4c8ce85..05f04f1ff9eea 100644 --- a/unified-runtime/source/adapters/hip/image.cpp +++ b/unified-runtime/source/adapters/hip/image.cpp @@ -635,7 +635,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( UR_RESULT_ERROR_INVALID_ARGUMENT); unsigned int NumChannels = 0; - size_t PixelSizeBytes = 0; + [[maybe_unused]] size_t PixelSizeBytes = 0; UR_CALL(urCalculateNumChannels(pSrcImageFormat->channelOrder, &NumChannels)); @@ -665,29 +665,26 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( memType == hipMemoryTypeArray, UR_RESULT_ERROR_INVALID_VALUE); - size_t CopyExtentBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; - const char *SrcWithOffset = static_cast(pSrc) + - (pCopyRegion->srcOffset.x * PixelSizeBytes); + size_t CopyExtentBytes = pCopyRegion->copyExtent.width; + const char *SrcWithOffset = + static_cast(pSrc) + pCopyRegion->srcOffset.x; if (memType == hipMemoryTypeArray) { // HIP doesn not provide async copies between host and image arrays // memory in versions earlier than 6.2. #if HIP_VERSION >= 60200000 - UR_CHECK_ERROR( - hipMemcpyHtoAAsync(static_cast(pDst), - pCopyRegion->dstOffset.x * PixelSizeBytes, - static_cast(SrcWithOffset), - CopyExtentBytes, Stream)); + UR_CHECK_ERROR(hipMemcpyHtoAAsync( + static_cast(pDst), pCopyRegion->dstOffset.x, + static_cast(SrcWithOffset), CopyExtentBytes, + Stream)); #else UR_CHECK_ERROR(hipMemcpyHtoA( - static_cast(pDst), - pCopyRegion->dstOffset.x * PixelSizeBytes, + static_cast(pDst), pCopyRegion->dstOffset.x, static_cast(SrcWithOffset), CopyExtentBytes)); #endif } else if (memType == hipMemoryTypeDevice) { - void *DstWithOffset = - static_cast(static_cast(pDst) + - (PixelSizeBytes * pCopyRegion->dstOffset.x)); + void *DstWithOffset = static_cast(static_cast(pDst) + + pCopyRegion->dstOffset.x); UR_CHECK_ERROR(hipMemcpyHtoDAsync( static_cast(DstWithOffset), const_cast(static_cast(SrcWithOffset)), @@ -700,9 +697,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( hip_Memcpy2D cpy_desc = {}; cpy_desc.srcMemoryType = hipMemoryTypeHost; cpy_desc.srcHost = pSrc; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.srcPitch = pSrcImageDesc->rowPitch; if (pDstImageDesc->rowPitch == 0) { @@ -714,15 +711,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.dstDevice = static_cast(pDst); cpy_desc.dstPitch = pDstImageDesc->rowPitch; } - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = pCopyRegion->copyExtent.height; UR_CHECK_ERROR(hipMemcpyParam2DAsync(&cpy_desc, Stream)); } else if (pDstImageDesc->type == UR_MEM_TYPE_IMAGE3D) { HIP_MEMCPY3D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; cpy_desc.srcZ = pCopyRegion->srcOffset.z; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.dstZ = pCopyRegion->dstOffset.z; cpy_desc.srcMemoryType = hipMemoryTypeHost; @@ -731,7 +728,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.srcHeight = pSrcImageDesc->height; cpy_desc.dstMemoryType = hipMemoryTypeArray; cpy_desc.dstArray = static_cast(pDst); - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = pCopyRegion->copyExtent.height; cpy_desc.Depth = pCopyRegion->copyExtent.depth; // 'hipMemcpy3DAsync' requires us to correctly create 'hipMemcpy3DParms' @@ -741,10 +738,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( pDstImageDesc->type == UR_MEM_TYPE_IMAGE2D_ARRAY || pDstImageDesc->type == UR_MEM_TYPE_IMAGE_CUBEMAP_EXP) { HIP_MEMCPY3D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; cpy_desc.srcZ = pCopyRegion->srcOffset.z; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.dstZ = pCopyRegion->dstOffset.z; cpy_desc.srcMemoryType = hipMemoryTypeHost; @@ -753,7 +750,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.srcHeight = std::max(MinCopyHeight, pSrcImageDesc->height); cpy_desc.dstMemoryType = hipMemoryTypeArray; cpy_desc.dstArray = static_cast(pDst); - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = std::max(MinCopyHeight, pCopyRegion->copyExtent.height); cpy_desc.Depth = pCopyRegion->copyExtent.depth; @@ -774,10 +771,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( memType == hipMemoryTypeArray, UR_RESULT_ERROR_INVALID_VALUE); - size_t CopyExtentBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; - void *DstWithOffset = - static_cast(static_cast(pDst) + - (PixelSizeBytes * pCopyRegion->dstOffset.x)); + size_t CopyExtentBytes = pCopyRegion->copyExtent.width; + void *DstWithOffset = static_cast(static_cast(pDst) + + pCopyRegion->dstOffset.x); if (memType == hipMemoryTypeArray) { // HIP doesn not provide async copies between image arrays and host @@ -785,17 +781,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( #if HIP_VERSION >= 60200000 UR_CHECK_ERROR(hipMemcpyAtoHAsync( DstWithOffset, static_cast(const_cast(pSrc)), - PixelSizeBytes * pCopyRegion->srcOffset.x, CopyExtentBytes, - Stream)); + pCopyRegion->srcOffset.x, CopyExtentBytes, Stream)); #else UR_CHECK_ERROR(hipMemcpyAtoH( DstWithOffset, static_cast(const_cast(pSrc)), - PixelSizeBytes * pCopyRegion->srcOffset.x, CopyExtentBytes)); + pCopyRegion->srcOffset.x, CopyExtentBytes)); #endif } else if (memType == hipMemoryTypeDevice) { const char *SrcWithOffset = - static_cast(pSrc) + - (pCopyRegion->srcOffset.x * PixelSizeBytes); + static_cast(pSrc) + pCopyRegion->srcOffset.x; UR_CHECK_ERROR(hipMemcpyDtoHAsync( DstWithOffset, static_cast(const_cast(SrcWithOffset)), @@ -806,9 +800,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( } } else if (pSrcImageDesc->type == UR_MEM_TYPE_IMAGE2D) { hip_Memcpy2D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.dstMemoryType = hipMemoryTypeHost; cpy_desc.dstHost = pDst; @@ -825,15 +819,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.dstMemoryType = hipMemoryTypeHost; cpy_desc.dstHost = pDst; cpy_desc.dstPitch = pDstImageDesc->rowPitch; - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = pCopyRegion->copyExtent.height; UR_CHECK_ERROR(hipMemcpyParam2DAsync(&cpy_desc, Stream)); } else if (pSrcImageDesc->type == UR_MEM_TYPE_IMAGE3D) { HIP_MEMCPY3D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; cpy_desc.srcZ = pCopyRegion->srcOffset.z; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.dstZ = pCopyRegion->dstOffset.z; cpy_desc.srcMemoryType = hipMemoryTypeArray; @@ -842,7 +836,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.dstHost = pDst; cpy_desc.dstPitch = pDstImageDesc->rowPitch; cpy_desc.dstHeight = pDstImageDesc->height; - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = pCopyRegion->copyExtent.height; cpy_desc.Depth = pCopyRegion->copyExtent.depth; // 'hipMemcpy3DAsync' requires us to correctly create @@ -853,10 +847,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( pSrcImageDesc->type == UR_MEM_TYPE_IMAGE2D_ARRAY || pSrcImageDesc->type == UR_MEM_TYPE_IMAGE_CUBEMAP_EXP) { HIP_MEMCPY3D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; cpy_desc.srcZ = pCopyRegion->srcOffset.z; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.dstZ = pCopyRegion->dstOffset.z; cpy_desc.srcMemoryType = hipMemoryTypeArray; @@ -865,7 +859,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( cpy_desc.dstHost = pDst; cpy_desc.dstPitch = pDstImageDesc->rowPitch; cpy_desc.dstHeight = std::max(MinCopyHeight, pDstImageDesc->height); - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = std::max(MinCopyHeight, pCopyRegion->copyExtent.height); cpy_desc.Depth = pCopyRegion->copyExtent.depth; @@ -889,43 +883,43 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( // the end if (pSrcImageDesc->type == UR_MEM_TYPE_IMAGE1D) { hip_Memcpy2D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = 0; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = 0; cpy_desc.srcMemoryType = hipMemoryTypeArray; cpy_desc.srcArray = static_cast(const_cast(pSrc)); cpy_desc.dstMemoryType = hipMemoryTypeArray; cpy_desc.dstArray = static_cast(pDst); - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = 1; UR_CHECK_ERROR(hipMemcpyParam2DAsync(&cpy_desc, Stream)); } else if (pSrcImageDesc->type == UR_MEM_TYPE_IMAGE2D) { hip_Memcpy2D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.srcMemoryType = hipMemoryTypeArray; cpy_desc.srcArray = static_cast(const_cast(pSrc)); cpy_desc.dstMemoryType = hipMemoryTypeArray; cpy_desc.dstArray = static_cast(pDst); - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = pCopyRegion->copyExtent.height; UR_CHECK_ERROR(hipMemcpyParam2DAsync(&cpy_desc, Stream)); } else if (pSrcImageDesc->type == UR_MEM_TYPE_IMAGE3D) { HIP_MEMCPY3D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; cpy_desc.srcZ = pCopyRegion->srcOffset.z; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.dstZ = pCopyRegion->dstOffset.z; cpy_desc.srcMemoryType = hipMemoryTypeArray; cpy_desc.srcArray = static_cast(const_cast(pSrc)); cpy_desc.dstMemoryType = hipMemoryTypeArray; cpy_desc.dstArray = static_cast(pDst); - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = pCopyRegion->copyExtent.height; cpy_desc.Depth = pCopyRegion->copyExtent.depth; // 'hipMemcpy3DAsync' requires us to correctly create @@ -936,17 +930,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( pSrcImageDesc->type == UR_MEM_TYPE_IMAGE2D_ARRAY || pSrcImageDesc->type == UR_MEM_TYPE_IMAGE_CUBEMAP_EXP) { HIP_MEMCPY3D cpy_desc = {}; - cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x * PixelSizeBytes; + cpy_desc.srcXInBytes = pCopyRegion->srcOffset.x; cpy_desc.srcY = pCopyRegion->srcOffset.y; cpy_desc.srcZ = pCopyRegion->srcOffset.z; - cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x * PixelSizeBytes; + cpy_desc.dstXInBytes = pCopyRegion->dstOffset.x; cpy_desc.dstY = pCopyRegion->dstOffset.y; cpy_desc.dstZ = pCopyRegion->dstOffset.z; cpy_desc.srcMemoryType = hipMemoryTypeArray; cpy_desc.srcArray = static_cast(const_cast(pSrc)); cpy_desc.dstMemoryType = hipMemoryTypeArray; cpy_desc.dstArray = static_cast(pDst); - cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width; + cpy_desc.WidthInBytes = pCopyRegion->copyExtent.width; cpy_desc.Height = std::max(MinCopyHeight, pCopyRegion->copyExtent.height); cpy_desc.Depth = pCopyRegion->copyExtent.depth; diff --git a/unified-runtime/source/adapters/level_zero/image_common.cpp b/unified-runtime/source/adapters/level_zero/image_common.cpp index 10f2ef1430ce2..e4cbd8edbe9a8 100644 --- a/unified-runtime/source/adapters/level_zero/image_common.cpp +++ b/unified-runtime/source/adapters/level_zero/image_common.cpp @@ -764,6 +764,27 @@ ur_result_t getImageRegionHelper(ze_image_desc_t ZeImageDesc, return UR_RESULT_SUCCESS; } +// ur_rect_offset_t and ur_rect_region_t describe their first component as +// bytes, whilst ze_image_region_t uses pixels. +// +// However, the getImageRegionHelper above is used for both bindless and regular +// images and APIs for the latter explicitly document that ur_rect_offset_t and +// ur_rect_region_t are misused and all their component are treated as pixels. +// +// As such, a new helper function for translation between UR and L0 formats is +// introduced instead of modifying the existing one above. +static ur_result_t getZeImageRegionHelper(ze_image_desc_t ZeImageDesc, + size_t PixelSizeInBytes, + ur_rect_offset_t *Origin, + ur_rect_region_t *Region, + ze_image_region_t &ZeRegion) { + UR_CALL(getImageRegionHelper(ZeImageDesc, Origin, Region, ZeRegion)); + ZeRegion.originX /= PixelSizeInBytes; + ZeRegion.width /= PixelSizeInBytes; + + return UR_RESULT_SUCCESS; +} + ur_result_t bindlessImagesHandleCopyFlags( const void *pSrc, void *pDst, const ur_image_desc_t *pSrcImageDesc, const ur_image_desc_t *pDstImageDesc, @@ -776,6 +797,8 @@ ur_result_t bindlessImagesHandleCopyFlags( ZeStruct zeSrcImageDesc; ur2zeImageDesc(pSrcImageFormat, pSrcImageDesc, zeSrcImageDesc); + uint32_t SrcPixelSizeInBytes = getPixelSizeBytes(pSrcImageFormat); + uint32_t DstPixelSizeInBytes = getPixelSizeBytes(pDstImageFormat); switch (imageCopyFlags) { case UR_EXP_IMAGE_COPY_FLAG_HOST_TO_DEVICE: { @@ -785,15 +808,15 @@ ur_result_t bindlessImagesHandleCopyFlags( // Copy to Non-USM memory ze_image_region_t DstRegion; - UR_CALL(getImageRegionHelper(zeSrcImageDesc, &pCopyRegion->dstOffset, - &pCopyRegion->copyExtent, DstRegion)); + UR_CALL(getZeImageRegionHelper(zeSrcImageDesc, SrcPixelSizeInBytes, + &pCopyRegion->dstOffset, + &pCopyRegion->copyExtent, DstRegion)); auto *urDstImg = static_cast(pDst); - const char *SrcPtr = - static_cast(pSrc) + - pCopyRegion->srcOffset.z * SrcSlicePitch + - pCopyRegion->srcOffset.y * SrcRowPitch + - pCopyRegion->srcOffset.x * getPixelSizeBytes(pSrcImageFormat); + const char *SrcPtr = static_cast(pSrc) + + pCopyRegion->srcOffset.z * SrcSlicePitch + + pCopyRegion->srcOffset.y * SrcRowPitch + + pCopyRegion->srcOffset.x; ZE2UR_CALL(zeCommandListAppendImageCopyFromMemoryExt, (ZeCommandList, urDstImg->getZeImage(), SrcPtr, &DstRegion, @@ -828,15 +851,15 @@ ur_result_t bindlessImagesHandleCopyFlags( if (pSrcImageDesc->rowPitch == 0) { // Copy from Non-USM memory to host ze_image_region_t SrcRegion; - UR_CALL(getImageRegionHelper(zeSrcImageDesc, &pCopyRegion->srcOffset, - &pCopyRegion->copyExtent, SrcRegion)); + UR_CALL(getZeImageRegionHelper(zeSrcImageDesc, SrcPixelSizeInBytes, + &pCopyRegion->srcOffset, + &pCopyRegion->copyExtent, SrcRegion)); auto *urSrcImg = reinterpret_cast(pSrc); char *DstPtr = static_cast(pDst) + pCopyRegion->dstOffset.z * DstSlicePitch + - pCopyRegion->dstOffset.y * DstRowPitch + - pCopyRegion->dstOffset.x * getPixelSizeBytes(pDstImageFormat); + pCopyRegion->dstOffset.y * DstRowPitch + pCopyRegion->dstOffset.x; ZE2UR_CALL(zeCommandListAppendImageCopyToMemoryExt, (ZeCommandList, DstPtr, urSrcImg->getZeImage(), &SrcRegion, DstRowPitch, DstSlicePitch, zeSignalEvent, numWaitEvents, @@ -866,11 +889,13 @@ ur_result_t bindlessImagesHandleCopyFlags( }; case UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_DEVICE: { ze_image_region_t DstRegion; - UR_CALL(getImageRegionHelper(zeSrcImageDesc, &pCopyRegion->dstOffset, - &pCopyRegion->copyExtent, DstRegion)); + UR_CALL(getZeImageRegionHelper(zeSrcImageDesc, DstPixelSizeInBytes, + &pCopyRegion->dstOffset, + &pCopyRegion->copyExtent, DstRegion)); ze_image_region_t SrcRegion; - UR_CALL(getImageRegionHelper(zeSrcImageDesc, &pCopyRegion->srcOffset, - &pCopyRegion->copyExtent, SrcRegion)); + UR_CALL(getZeImageRegionHelper(zeSrcImageDesc, SrcPixelSizeInBytes, + &pCopyRegion->srcOffset, + &pCopyRegion->copyExtent, SrcRegion)); auto *urImgSrc = reinterpret_cast(pSrc); auto *urImgDst = reinterpret_cast(pDst);