Skip to content

Commit 6ed78ec

Browse files
authored
[clr] rely on __builtins for memset/memcpy device functions (#4047)
## Motivation Simplify the device memset/memcpy code ## Technical Details Use compiler builtins for memset and memcpy. Compiler has recently made some changes to make sure that builtins generate good code for these operations which made it to upstream LLVM, so we can use this from now. This also changes the signature of memset function, from unsigned char to something that builtin expects, this should not result in any ABI break since `unsigned char` basically fits in an `int` (standard int promotion) and since its a `__device__` function it will be inlined and will have no traces as a function inside final code. ## JIRA ID NA ## Test Plan ## Test Result ## Submission Checklist - [x] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.
1 parent e3943fe commit 6ed78ec

File tree

1 file changed

+7
-49
lines changed

1 file changed

+7
-49
lines changed

projects/clr/hipamd/include/hip/amd_detail/amd_device_functions.h

Lines changed: 7 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -888,65 +888,23 @@ unsigned __smid(void)
888888

889889
#endif // defined(__clang__) && defined(__HIP__)
890890

891-
892-
// loop unrolling
891+
// rely on `__builtin_* functions for memcpy/memset
893892
static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
894-
auto dstPtr = static_cast<unsigned char*>(dst);
895-
auto srcPtr = static_cast<const unsigned char*>(src);
896-
897-
while (size >= 4u) {
898-
dstPtr[0] = srcPtr[0];
899-
dstPtr[1] = srcPtr[1];
900-
dstPtr[2] = srcPtr[2];
901-
dstPtr[3] = srcPtr[3];
902-
903-
size -= 4u;
904-
srcPtr += 4u;
905-
dstPtr += 4u;
906-
}
907-
switch (size) {
908-
case 3:
909-
dstPtr[2] = srcPtr[2];
910-
case 2:
911-
dstPtr[1] = srcPtr[1];
912-
case 1:
913-
dstPtr[0] = srcPtr[0];
914-
}
915-
916-
return dst;
893+
return __builtin_memcpy(dst, src, size);
917894
}
918895

919-
static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
920-
auto dstPtr = static_cast<unsigned char*>(dst);
921-
922-
while (size >= 4u) {
923-
dstPtr[0] = val;
924-
dstPtr[1] = val;
925-
dstPtr[2] = val;
926-
dstPtr[3] = val;
927-
928-
size -= 4u;
929-
dstPtr += 4u;
930-
}
931-
switch (size) {
932-
case 3:
933-
dstPtr[2] = val;
934-
case 2:
935-
dstPtr[1] = val;
936-
case 1:
937-
dstPtr[0] = val;
938-
}
939-
940-
return dst;
896+
// change the value from unsigned char to int, what a builtin expects.
897+
static inline __device__ void* __hip_hc_memset(void* dst, int val, size_t size) {
898+
return __builtin_memset(dst, val, size);
941899
}
900+
942901
#ifndef __OPENMP_AMDGCN__
943902
static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
944903
return __hip_hc_memcpy(dst, src, size);
945904
}
946905

947906
static inline __device__ void* memset(void* ptr, int val, size_t size) {
948-
unsigned char val8 = static_cast<unsigned char>(val);
949-
return __hip_hc_memset(ptr, val8, size);
907+
return __hip_hc_memset(ptr, val, size);
950908
}
951909
#endif // !__OPENMP_AMDGCN__
952910

0 commit comments

Comments
 (0)