Skip to content

Commit 0c480dd

Browse files
authored
[clang][CodeGen] cast addr space of ReturnValue if needed (#154380)
Fixes a bug on AMDGPU targets where a pointer was stored as address space 5, but then loaded as address space 0. Issue found as part of [Kokkos](https://github.com/kokkos/kokkos) testing, specifically `hip.atomics` (see [core/unit_test/TestAtomics.hpp](https://github.com/kokkos/kokkos/blob/develop/core/unit_test/TestAtomics.hpp)). Issue was introduced by commit [39ec9de](39ec9de7c230) - [clang][CodeGen] sret args should always point to the alloca AS, so use that (#114062).
1 parent 955c475 commit 0c480dd

File tree

7 files changed

+75
-14
lines changed

7 files changed

+75
-14
lines changed

clang/lib/CodeGen/CGDecl.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1563,11 +1563,10 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
15631563
// The named return value optimization: allocate this variable in the
15641564
// return slot, so that we can elide the copy when returning this
15651565
// variable (C++0x [class.copy]p34).
1566-
address = ReturnValue;
15671566
AllocaAddr =
15681567
RawAddress(ReturnValue.emitRawPointer(*this),
15691568
ReturnValue.getElementType(), ReturnValue.getAlignment());
1570-
;
1569+
address = MaybeCastStackAddressSpace(AllocaAddr, Ty.getAddressSpace());
15711570

15721571
if (const RecordType *RecordTy = Ty->getAs<RecordType>()) {
15731572
const auto *RD = RecordTy->getOriginalDecl()->getDefinitionOrSelf();

clang/lib/CodeGen/CGExpr.cpp

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -108,13 +108,10 @@ CodeGenFunction::CreateTempAllocaWithoutCast(llvm::Type *Ty, CharUnits Align,
108108
return RawAddress(Alloca, Ty, Align, KnownNonNull);
109109
}
110110

111-
RawAddress CodeGenFunction::CreateTempAlloca(llvm::Type *Ty, LangAS DestLangAS,
112-
CharUnits Align, const Twine &Name,
113-
llvm::Value *ArraySize,
114-
RawAddress *AllocaAddr) {
115-
RawAddress Alloca = CreateTempAllocaWithoutCast(Ty, Align, Name, ArraySize);
116-
if (AllocaAddr)
117-
*AllocaAddr = Alloca;
111+
RawAddress CodeGenFunction::MaybeCastStackAddressSpace(RawAddress Alloca,
112+
LangAS DestLangAS,
113+
llvm::Value *ArraySize) {
114+
118115
llvm::Value *V = Alloca.getPointer();
119116
// Alloca always returns a pointer in alloca address space, which may
120117
// be different from the type defined by the language. For example,
@@ -134,7 +131,18 @@ RawAddress CodeGenFunction::CreateTempAlloca(llvm::Type *Ty, LangAS DestLangAS,
134131
/*IsNonNull=*/true);
135132
}
136133

137-
return RawAddress(V, Ty, Align, KnownNonNull);
134+
return RawAddress(V, Alloca.getElementType(), Alloca.getAlignment(),
135+
KnownNonNull);
136+
}
137+
138+
RawAddress CodeGenFunction::CreateTempAlloca(llvm::Type *Ty, LangAS DestLangAS,
139+
CharUnits Align, const Twine &Name,
140+
llvm::Value *ArraySize,
141+
RawAddress *AllocaAddr) {
142+
RawAddress Alloca = CreateTempAllocaWithoutCast(Ty, Align, Name, ArraySize);
143+
if (AllocaAddr)
144+
*AllocaAddr = Alloca;
145+
return MaybeCastStackAddressSpace(Alloca, DestLangAS, ArraySize);
138146
}
139147

140148
/// CreateTempAlloca - This creates an alloca and inserts it into the entry

clang/lib/CodeGen/CodeGenFunction.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2804,6 +2804,13 @@ class CodeGenFunction : public CodeGenTypeCache {
28042804
AllocaTracker Tracker;
28052805
};
28062806

2807+
private:
2808+
/// If \p Alloca is not in the same address space as \p DestLangAS, insert an
2809+
/// address space cast and return a new RawAddress based on this value.
2810+
RawAddress MaybeCastStackAddressSpace(RawAddress Alloca, LangAS DestLangAS,
2811+
llvm::Value *ArraySize = nullptr);
2812+
2813+
public:
28072814
/// CreateTempAlloca - This creates an alloca and inserts it into the entry
28082815
/// block if \p ArraySize is nullptr, otherwise inserts it at the current
28092816
/// insertion point of the builder. The caller is responsible for setting an

clang/test/CodeGenCXX/sret_cast_with_nonzero_alloca_as.cpp

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,16 +10,15 @@ struct X { int z[17]; };
1010
// CHECK-NEXT: [[Y_ADDR:%.*]] = alloca i8, align 1, addrspace(5)
1111
// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
1212
// CHECK-NEXT: [[Y_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[Y_ADDR]] to ptr
13+
// CHECK-NEXT: [[AGG_RESULT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[AGG_RESULT]] to ptr
1314
// CHECK-NEXT: store i8 [[X]], ptr [[X_ADDR_ASCAST]], align 1
1415
// CHECK-NEXT: store i8 [[Y]], ptr [[Y_ADDR_ASCAST]], align 1
1516
// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[X_ADDR_ASCAST]], align 1
16-
// CHECK-NEXT: [[AGG_RESULT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[AGG_RESULT]] to ptr
1717
// CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT_ASCAST]], i64 1
1818
// CHECK-NEXT: store i8 [[TMP0]], ptr [[ADD_PTR]], align 1
1919
// CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[Y_ADDR_ASCAST]], align 1
20-
// CHECK-NEXT: [[AGG_RESULT_ASCAST1:%.*]] = addrspacecast ptr addrspace(5) [[AGG_RESULT]] to ptr
21-
// CHECK-NEXT: [[ADD_PTR2:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT_ASCAST1]], i64 2
22-
// CHECK-NEXT: store i8 [[TMP1]], ptr [[ADD_PTR2]], align 1
20+
// CHECK-NEXT: [[ADD_PTR1:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT_ASCAST]], i64 2
21+
// CHECK-NEXT: store i8 [[TMP1]], ptr [[ADD_PTR1]], align 1
2322
// CHECK-NEXT: ret void
2423
//
2524
X foo(char x, char y) {
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --functions "bar" --version 5
2+
// REQUIRES: amdgpu-registered-target
3+
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -emit-llvm -fcuda-is-device \
4+
// RUN: -o - %s | FileCheck --check-prefix=AMDGCN --enable-var-scope %s
5+
6+
struct Foo {
7+
unsigned long long val;
8+
//
9+
__attribute__((device)) inline Foo() { val = 0; }
10+
__attribute__((device)) inline Foo(const Foo &src) { val = src.val; }
11+
__attribute__((device)) inline Foo(const volatile Foo &src) { val = src.val; }
12+
};
13+
14+
// AMDGCN-LABEL: define dso_local void @_Z3barPK3Foo(
15+
// AMDGCN-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_FOO:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[SRC_PTR:%.*]]) #[[ATTR0:[0-9]+]] {
16+
// AMDGCN-NEXT: [[ENTRY:.*:]]
17+
// AMDGCN-NEXT: [[RESULT_PTR:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5)
18+
// AMDGCN-NEXT: [[SRC_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
19+
// AMDGCN-NEXT: [[DST:%.*]] = alloca [[UNION_ANON:%.*]], align 8, addrspace(5)
20+
// AMDGCN-NEXT: [[RESULT_PTR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RESULT_PTR]] to ptr
21+
// AMDGCN-NEXT: [[SRC_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_PTR_ADDR]] to ptr
22+
// AMDGCN-NEXT: [[AGG_RESULT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[AGG_RESULT]] to ptr
23+
// AMDGCN-NEXT: [[DST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST]] to ptr
24+
// AMDGCN-NEXT: store ptr addrspace(5) [[AGG_RESULT]], ptr [[RESULT_PTR_ASCAST]], align 4
25+
// AMDGCN-NEXT: store ptr [[SRC_PTR]], ptr [[SRC_PTR_ADDR_ASCAST]], align 8
26+
// AMDGCN-NEXT: call void @_ZN3FooC1Ev(ptr noundef nonnull align 8 dereferenceable(8) [[AGG_RESULT_ASCAST]]) #[[ATTR1:[0-9]+]]
27+
// AMDGCN-NEXT: store ptr [[AGG_RESULT_ASCAST]], ptr [[DST_ASCAST]], align 8
28+
// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SRC_PTR_ADDR_ASCAST]], align 8
29+
// AMDGCN-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[STRUCT_FOO]], ptr [[TMP0]], i32 0, i32 0
30+
// AMDGCN-NEXT: [[TMP1:%.*]] = load i64, ptr [[VAL]], align 8
31+
// AMDGCN-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DST_ASCAST]], align 8
32+
// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i64 0
33+
// AMDGCN-NEXT: store i64 [[TMP1]], ptr [[ARRAYIDX]], align 8
34+
// AMDGCN-NEXT: ret void
35+
//
36+
__attribute__((device)) Foo bar(const Foo *const src_ptr) {
37+
Foo result;
38+
39+
union {
40+
Foo* const ptr;
41+
unsigned long long * const ptr64;
42+
} dst = {&result};
43+
44+
dst.ptr64[0] = src_ptr->val;
45+
return result;
46+
}

clang/test/CodeGenOpenCL/addr-space-struct-arg.cl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -647,6 +647,7 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
647647
// AMDGCN20-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5)
648648
// AMDGCN20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
649649
// AMDGCN20-NEXT: [[IN1:%.*]] = addrspacecast ptr addrspace(5) [[IN]] to ptr
650+
// AMDGCN20-NEXT: [[RETVAL_ASCAST_ASCAST:%.*]] = addrspacecast ptr [[RETVAL_ASCAST]] to ptr addrspace(5)
650651
// AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr [[IN1]], i32 0, i32 0
651652
// AMDGCN20-NEXT: store [9 x i32] [[IN_COERCE]], ptr [[COERCE_DIVE]], align 4
652653
// AMDGCN20-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr [[RETVAL_ASCAST]], align 4

clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
121121
// AMDGCN-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5)
122122
// AMDGCN-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
123123
// AMDGCN-NEXT: [[IN1:%.*]] = addrspacecast ptr addrspace(5) [[IN]] to ptr
124+
// AMDGCN-NEXT: [[RETVAL_ASCAST_ASCAST:%.*]] = addrspacecast ptr [[RETVAL_ASCAST]] to ptr addrspace(5)
124125
// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr [[IN1]], i32 0, i32 0
125126
// AMDGCN-NEXT: store [9 x i32] [[IN_COERCE]], ptr [[COERCE_DIVE]], align 4
126127
// AMDGCN-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr [[RETVAL_ASCAST]], align 4

0 commit comments

Comments
 (0)