Skip to content

Commit d494eb0

Browse files
authored
[NVPTX] Skip numbering unreferenced virtual registers (readability) (#154391)
When assigning numbers to registers, skip any with neither uses nor defs. This will not have any impact at all on the final SASS, but it makes for slightly more readable PTX. This change should also ensure that future minor changes are less likely to cause noisy diffs in register numbering.
1 parent 13faa15 commit d494eb0

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+3640
-3652
lines changed

llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1458,7 +1458,6 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters(
14581458
// Map the global virtual register number to a register class specific
14591459
// virtual register number starting from 1 with that class.
14601460
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
1461-
//unsigned numRegClasses = TRI->getNumRegClasses();
14621461

14631462
// Emit the Fake Stack Object
14641463
const MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -1479,13 +1478,12 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters(
14791478
// global virtual
14801479
// register number and the per class virtual register number.
14811480
// We use the per class virtual register number in the ptx output.
1482-
unsigned int numVRs = MRI->getNumVirtRegs();
1483-
for (unsigned i = 0; i < numVRs; i++) {
1484-
Register vr = Register::index2VirtReg(i);
1485-
const TargetRegisterClass *RC = MRI->getRegClass(vr);
1486-
DenseMap<unsigned, unsigned> &regmap = VRegMapping[RC];
1487-
int n = regmap.size();
1488-
regmap.insert(std::make_pair(vr, n + 1));
1481+
for (unsigned I : llvm::seq(MRI->getNumVirtRegs())) {
1482+
Register VR = Register::index2VirtReg(I);
1483+
if (MRI->use_empty(VR) && MRI->def_empty(VR))
1484+
continue;
1485+
auto &RCRegMap = VRegMapping[MRI->getRegClass(VR)];
1486+
RCRegMap[VR] = RCRegMap.size() + 1;
14891487
}
14901488

14911489
// Emit declaration of the virtual registers or 'physical' registers for

llvm/test/CodeGen/NVPTX/aggregate-return.ll

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ declare {float, float} @bars({float, float} %input)
1010
define void @test_v2f32(<2 x float> %input, ptr %output) {
1111
; CHECK-LABEL: test_v2f32(
1212
; CHECK: {
13-
; CHECK-NEXT: .reg .b64 %rd<5>;
13+
; CHECK-NEXT: .reg .b64 %rd<4>;
1414
; CHECK-EMPTY:
1515
; CHECK-NEXT: // %bb.0:
1616
; CHECK-NEXT: ld.param.b64 %rd1, [test_v2f32_param_0];
@@ -21,8 +21,8 @@ define void @test_v2f32(<2 x float> %input, ptr %output) {
2121
; CHECK-NEXT: call.uni (retval0), barv, (param0);
2222
; CHECK-NEXT: ld.param.b64 %rd2, [retval0];
2323
; CHECK-NEXT: } // callseq 0
24-
; CHECK-NEXT: ld.param.b64 %rd4, [test_v2f32_param_1];
25-
; CHECK-NEXT: st.b64 [%rd4], %rd2;
24+
; CHECK-NEXT: ld.param.b64 %rd3, [test_v2f32_param_1];
25+
; CHECK-NEXT: st.b64 [%rd3], %rd2;
2626
; CHECK-NEXT: ret;
2727
%call = tail call <2 x float> @barv(<2 x float> %input)
2828
store <2 x float> %call, ptr %output, align 8
@@ -32,8 +32,8 @@ define void @test_v2f32(<2 x float> %input, ptr %output) {
3232
define void @test_v3f32(<3 x float> %input, ptr %output) {
3333
; CHECK-LABEL: test_v3f32(
3434
; CHECK: {
35-
; CHECK-NEXT: .reg .b32 %r<4>;
36-
; CHECK-NEXT: .reg .b64 %rd<5>;
35+
; CHECK-NEXT: .reg .b32 %r<3>;
36+
; CHECK-NEXT: .reg .b64 %rd<4>;
3737
; CHECK-EMPTY:
3838
; CHECK-NEXT: // %bb.0:
3939
; CHECK-NEXT: ld.param.b64 %rd1, [test_v3f32_param_0];
@@ -47,9 +47,9 @@ define void @test_v3f32(<3 x float> %input, ptr %output) {
4747
; CHECK-NEXT: ld.param.b32 %r2, [retval0+8];
4848
; CHECK-NEXT: ld.param.b64 %rd2, [retval0];
4949
; CHECK-NEXT: } // callseq 1
50-
; CHECK-NEXT: ld.param.b64 %rd4, [test_v3f32_param_1];
51-
; CHECK-NEXT: st.b32 [%rd4+8], %r2;
52-
; CHECK-NEXT: st.b64 [%rd4], %rd2;
50+
; CHECK-NEXT: ld.param.b64 %rd3, [test_v3f32_param_1];
51+
; CHECK-NEXT: st.b32 [%rd3+8], %r2;
52+
; CHECK-NEXT: st.b64 [%rd3], %rd2;
5353
; CHECK-NEXT: ret;
5454
%call = tail call <3 x float> @barv3(<3 x float> %input)
5555
; Make sure we don't load more values than we need to.
@@ -60,7 +60,7 @@ define void @test_v3f32(<3 x float> %input, ptr %output) {
6060
define void @test_a2f32([2 x float] %input, ptr %output) {
6161
; CHECK-LABEL: test_a2f32(
6262
; CHECK: {
63-
; CHECK-NEXT: .reg .b32 %r<7>;
63+
; CHECK-NEXT: .reg .b32 %r<5>;
6464
; CHECK-NEXT: .reg .b64 %rd<2>;
6565
; CHECK-EMPTY:
6666
; CHECK-NEXT: // %bb.0:
@@ -87,7 +87,7 @@ define void @test_a2f32([2 x float] %input, ptr %output) {
8787
define void @test_s2f32({float, float} %input, ptr %output) {
8888
; CHECK-LABEL: test_s2f32(
8989
; CHECK: {
90-
; CHECK-NEXT: .reg .b32 %r<7>;
90+
; CHECK-NEXT: .reg .b32 %r<5>;
9191
; CHECK-NEXT: .reg .b64 %rd<2>;
9292
; CHECK-EMPTY:
9393
; CHECK-NEXT: // %bb.0:

llvm/test/CodeGen/NVPTX/atomics-sm70.ll

Lines changed: 62 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -47,90 +47,90 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
4747
; CHECKPTX62: {
4848
; CHECKPTX62-NEXT: .reg .pred %p<5>;
4949
; CHECKPTX62-NEXT: .reg .b16 %rs<11>;
50-
; CHECKPTX62-NEXT: .reg .b32 %r<58>;
50+
; CHECKPTX62-NEXT: .reg .b32 %r<50>;
5151
; CHECKPTX62-EMPTY:
5252
; CHECKPTX62-NEXT: // %bb.0:
5353
; CHECKPTX62-NEXT: ld.param.b16 %rs1, [test_param_3];
54-
; CHECKPTX62-NEXT: ld.param.b32 %r23, [test_param_2];
55-
; CHECKPTX62-NEXT: ld.param.b32 %r22, [test_param_1];
56-
; CHECKPTX62-NEXT: ld.param.b32 %r24, [test_param_0];
57-
; CHECKPTX62-NEXT: and.b32 %r1, %r24, -4;
58-
; CHECKPTX62-NEXT: and.b32 %r25, %r24, 3;
59-
; CHECKPTX62-NEXT: shl.b32 %r2, %r25, 3;
60-
; CHECKPTX62-NEXT: mov.b32 %r26, 65535;
61-
; CHECKPTX62-NEXT: shl.b32 %r27, %r26, %r2;
62-
; CHECKPTX62-NEXT: not.b32 %r3, %r27;
63-
; CHECKPTX62-NEXT: ld.b32 %r54, [%r1];
54+
; CHECKPTX62-NEXT: ld.param.b32 %r15, [test_param_2];
55+
; CHECKPTX62-NEXT: ld.param.b32 %r14, [test_param_1];
56+
; CHECKPTX62-NEXT: ld.param.b32 %r16, [test_param_0];
57+
; CHECKPTX62-NEXT: and.b32 %r1, %r16, -4;
58+
; CHECKPTX62-NEXT: and.b32 %r17, %r16, 3;
59+
; CHECKPTX62-NEXT: shl.b32 %r2, %r17, 3;
60+
; CHECKPTX62-NEXT: mov.b32 %r18, 65535;
61+
; CHECKPTX62-NEXT: shl.b32 %r19, %r18, %r2;
62+
; CHECKPTX62-NEXT: not.b32 %r3, %r19;
63+
; CHECKPTX62-NEXT: ld.b32 %r46, [%r1];
6464
; CHECKPTX62-NEXT: $L__BB0_1: // %atomicrmw.start45
6565
; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1
66-
; CHECKPTX62-NEXT: shr.u32 %r28, %r54, %r2;
67-
; CHECKPTX62-NEXT: cvt.u16.u32 %rs2, %r28;
66+
; CHECKPTX62-NEXT: shr.u32 %r20, %r46, %r2;
67+
; CHECKPTX62-NEXT: cvt.u16.u32 %rs2, %r20;
6868
; CHECKPTX62-NEXT: add.rn.f16 %rs3, %rs2, %rs1;
69-
; CHECKPTX62-NEXT: cvt.u32.u16 %r29, %rs3;
70-
; CHECKPTX62-NEXT: shl.b32 %r30, %r29, %r2;
71-
; CHECKPTX62-NEXT: and.b32 %r31, %r54, %r3;
72-
; CHECKPTX62-NEXT: or.b32 %r32, %r31, %r30;
73-
; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r6, [%r1], %r54, %r32;
74-
; CHECKPTX62-NEXT: setp.ne.b32 %p1, %r6, %r54;
75-
; CHECKPTX62-NEXT: mov.b32 %r54, %r6;
69+
; CHECKPTX62-NEXT: cvt.u32.u16 %r21, %rs3;
70+
; CHECKPTX62-NEXT: shl.b32 %r22, %r21, %r2;
71+
; CHECKPTX62-NEXT: and.b32 %r23, %r46, %r3;
72+
; CHECKPTX62-NEXT: or.b32 %r24, %r23, %r22;
73+
; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r4, [%r1], %r46, %r24;
74+
; CHECKPTX62-NEXT: setp.ne.b32 %p1, %r4, %r46;
75+
; CHECKPTX62-NEXT: mov.b32 %r46, %r4;
7676
; CHECKPTX62-NEXT: @%p1 bra $L__BB0_1;
7777
; CHECKPTX62-NEXT: // %bb.2: // %atomicrmw.end44
78-
; CHECKPTX62-NEXT: ld.b32 %r55, [%r1];
78+
; CHECKPTX62-NEXT: ld.b32 %r47, [%r1];
7979
; CHECKPTX62-NEXT: $L__BB0_3: // %atomicrmw.start27
8080
; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1
81-
; CHECKPTX62-NEXT: shr.u32 %r33, %r55, %r2;
82-
; CHECKPTX62-NEXT: cvt.u16.u32 %rs4, %r33;
81+
; CHECKPTX62-NEXT: shr.u32 %r25, %r47, %r2;
82+
; CHECKPTX62-NEXT: cvt.u16.u32 %rs4, %r25;
8383
; CHECKPTX62-NEXT: mov.b16 %rs5, 0x3C00;
8484
; CHECKPTX62-NEXT: add.rn.f16 %rs6, %rs4, %rs5;
85-
; CHECKPTX62-NEXT: cvt.u32.u16 %r34, %rs6;
86-
; CHECKPTX62-NEXT: shl.b32 %r35, %r34, %r2;
87-
; CHECKPTX62-NEXT: and.b32 %r36, %r55, %r3;
88-
; CHECKPTX62-NEXT: or.b32 %r37, %r36, %r35;
89-
; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r9, [%r1], %r55, %r37;
90-
; CHECKPTX62-NEXT: setp.ne.b32 %p2, %r9, %r55;
91-
; CHECKPTX62-NEXT: mov.b32 %r55, %r9;
85+
; CHECKPTX62-NEXT: cvt.u32.u16 %r26, %rs6;
86+
; CHECKPTX62-NEXT: shl.b32 %r27, %r26, %r2;
87+
; CHECKPTX62-NEXT: and.b32 %r28, %r47, %r3;
88+
; CHECKPTX62-NEXT: or.b32 %r29, %r28, %r27;
89+
; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r5, [%r1], %r47, %r29;
90+
; CHECKPTX62-NEXT: setp.ne.b32 %p2, %r5, %r47;
91+
; CHECKPTX62-NEXT: mov.b32 %r47, %r5;
9292
; CHECKPTX62-NEXT: @%p2 bra $L__BB0_3;
9393
; CHECKPTX62-NEXT: // %bb.4: // %atomicrmw.end26
94-
; CHECKPTX62-NEXT: and.b32 %r10, %r22, -4;
95-
; CHECKPTX62-NEXT: shl.b32 %r38, %r22, 3;
96-
; CHECKPTX62-NEXT: and.b32 %r11, %r38, 24;
97-
; CHECKPTX62-NEXT: mov.b32 %r39, 65535;
98-
; CHECKPTX62-NEXT: shl.b32 %r40, %r39, %r11;
99-
; CHECKPTX62-NEXT: not.b32 %r12, %r40;
100-
; CHECKPTX62-NEXT: ld.global.b32 %r56, [%r10];
94+
; CHECKPTX62-NEXT: and.b32 %r6, %r14, -4;
95+
; CHECKPTX62-NEXT: shl.b32 %r30, %r14, 3;
96+
; CHECKPTX62-NEXT: and.b32 %r7, %r30, 24;
97+
; CHECKPTX62-NEXT: mov.b32 %r31, 65535;
98+
; CHECKPTX62-NEXT: shl.b32 %r32, %r31, %r7;
99+
; CHECKPTX62-NEXT: not.b32 %r8, %r32;
100+
; CHECKPTX62-NEXT: ld.global.b32 %r48, [%r6];
101101
; CHECKPTX62-NEXT: $L__BB0_5: // %atomicrmw.start9
102102
; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1
103-
; CHECKPTX62-NEXT: shr.u32 %r41, %r56, %r11;
104-
; CHECKPTX62-NEXT: cvt.u16.u32 %rs7, %r41;
103+
; CHECKPTX62-NEXT: shr.u32 %r33, %r48, %r7;
104+
; CHECKPTX62-NEXT: cvt.u16.u32 %rs7, %r33;
105105
; CHECKPTX62-NEXT: add.rn.f16 %rs8, %rs7, %rs1;
106-
; CHECKPTX62-NEXT: cvt.u32.u16 %r42, %rs8;
107-
; CHECKPTX62-NEXT: shl.b32 %r43, %r42, %r11;
108-
; CHECKPTX62-NEXT: and.b32 %r44, %r56, %r12;
109-
; CHECKPTX62-NEXT: or.b32 %r45, %r44, %r43;
110-
; CHECKPTX62-NEXT: atom.relaxed.sys.global.cas.b32 %r15, [%r10], %r56, %r45;
111-
; CHECKPTX62-NEXT: setp.ne.b32 %p3, %r15, %r56;
112-
; CHECKPTX62-NEXT: mov.b32 %r56, %r15;
106+
; CHECKPTX62-NEXT: cvt.u32.u16 %r34, %rs8;
107+
; CHECKPTX62-NEXT: shl.b32 %r35, %r34, %r7;
108+
; CHECKPTX62-NEXT: and.b32 %r36, %r48, %r8;
109+
; CHECKPTX62-NEXT: or.b32 %r37, %r36, %r35;
110+
; CHECKPTX62-NEXT: atom.relaxed.sys.global.cas.b32 %r9, [%r6], %r48, %r37;
111+
; CHECKPTX62-NEXT: setp.ne.b32 %p3, %r9, %r48;
112+
; CHECKPTX62-NEXT: mov.b32 %r48, %r9;
113113
; CHECKPTX62-NEXT: @%p3 bra $L__BB0_5;
114114
; CHECKPTX62-NEXT: // %bb.6: // %atomicrmw.end8
115-
; CHECKPTX62-NEXT: and.b32 %r16, %r23, -4;
116-
; CHECKPTX62-NEXT: shl.b32 %r46, %r23, 3;
117-
; CHECKPTX62-NEXT: and.b32 %r17, %r46, 24;
118-
; CHECKPTX62-NEXT: mov.b32 %r47, 65535;
119-
; CHECKPTX62-NEXT: shl.b32 %r48, %r47, %r17;
120-
; CHECKPTX62-NEXT: not.b32 %r18, %r48;
121-
; CHECKPTX62-NEXT: ld.shared.b32 %r57, [%r16];
115+
; CHECKPTX62-NEXT: and.b32 %r10, %r15, -4;
116+
; CHECKPTX62-NEXT: shl.b32 %r38, %r15, 3;
117+
; CHECKPTX62-NEXT: and.b32 %r11, %r38, 24;
118+
; CHECKPTX62-NEXT: mov.b32 %r39, 65535;
119+
; CHECKPTX62-NEXT: shl.b32 %r40, %r39, %r11;
120+
; CHECKPTX62-NEXT: not.b32 %r12, %r40;
121+
; CHECKPTX62-NEXT: ld.shared.b32 %r49, [%r10];
122122
; CHECKPTX62-NEXT: $L__BB0_7: // %atomicrmw.start
123123
; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1
124-
; CHECKPTX62-NEXT: shr.u32 %r49, %r57, %r17;
125-
; CHECKPTX62-NEXT: cvt.u16.u32 %rs9, %r49;
124+
; CHECKPTX62-NEXT: shr.u32 %r41, %r49, %r11;
125+
; CHECKPTX62-NEXT: cvt.u16.u32 %rs9, %r41;
126126
; CHECKPTX62-NEXT: add.rn.f16 %rs10, %rs9, %rs1;
127-
; CHECKPTX62-NEXT: cvt.u32.u16 %r50, %rs10;
128-
; CHECKPTX62-NEXT: shl.b32 %r51, %r50, %r17;
129-
; CHECKPTX62-NEXT: and.b32 %r52, %r57, %r18;
130-
; CHECKPTX62-NEXT: or.b32 %r53, %r52, %r51;
131-
; CHECKPTX62-NEXT: atom.relaxed.sys.shared.cas.b32 %r21, [%r16], %r57, %r53;
132-
; CHECKPTX62-NEXT: setp.ne.b32 %p4, %r21, %r57;
133-
; CHECKPTX62-NEXT: mov.b32 %r57, %r21;
127+
; CHECKPTX62-NEXT: cvt.u32.u16 %r42, %rs10;
128+
; CHECKPTX62-NEXT: shl.b32 %r43, %r42, %r11;
129+
; CHECKPTX62-NEXT: and.b32 %r44, %r49, %r12;
130+
; CHECKPTX62-NEXT: or.b32 %r45, %r44, %r43;
131+
; CHECKPTX62-NEXT: atom.relaxed.sys.shared.cas.b32 %r13, [%r10], %r49, %r45;
132+
; CHECKPTX62-NEXT: setp.ne.b32 %p4, %r13, %r49;
133+
; CHECKPTX62-NEXT: mov.b32 %r49, %r13;
134134
; CHECKPTX62-NEXT: @%p4 bra $L__BB0_7;
135135
; CHECKPTX62-NEXT: // %bb.8: // %atomicrmw.end
136136
; CHECKPTX62-NEXT: ret;

llvm/test/CodeGen/NVPTX/atomics-sm90.ll

Lines changed: 62 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -47,93 +47,93 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
4747
; CHECKPTX71: {
4848
; CHECKPTX71-NEXT: .reg .pred %p<5>;
4949
; CHECKPTX71-NEXT: .reg .b16 %rs<14>;
50-
; CHECKPTX71-NEXT: .reg .b32 %r<58>;
50+
; CHECKPTX71-NEXT: .reg .b32 %r<50>;
5151
; CHECKPTX71-EMPTY:
5252
; CHECKPTX71-NEXT: // %bb.0:
5353
; CHECKPTX71-NEXT: ld.param.b16 %rs1, [test_param_3];
54-
; CHECKPTX71-NEXT: ld.param.b32 %r23, [test_param_2];
55-
; CHECKPTX71-NEXT: ld.param.b32 %r22, [test_param_1];
56-
; CHECKPTX71-NEXT: ld.param.b32 %r24, [test_param_0];
57-
; CHECKPTX71-NEXT: and.b32 %r1, %r24, -4;
58-
; CHECKPTX71-NEXT: and.b32 %r25, %r24, 3;
59-
; CHECKPTX71-NEXT: shl.b32 %r2, %r25, 3;
60-
; CHECKPTX71-NEXT: mov.b32 %r26, 65535;
61-
; CHECKPTX71-NEXT: shl.b32 %r27, %r26, %r2;
62-
; CHECKPTX71-NEXT: not.b32 %r3, %r27;
63-
; CHECKPTX71-NEXT: ld.b32 %r54, [%r1];
54+
; CHECKPTX71-NEXT: ld.param.b32 %r15, [test_param_2];
55+
; CHECKPTX71-NEXT: ld.param.b32 %r14, [test_param_1];
56+
; CHECKPTX71-NEXT: ld.param.b32 %r16, [test_param_0];
57+
; CHECKPTX71-NEXT: and.b32 %r1, %r16, -4;
58+
; CHECKPTX71-NEXT: and.b32 %r17, %r16, 3;
59+
; CHECKPTX71-NEXT: shl.b32 %r2, %r17, 3;
60+
; CHECKPTX71-NEXT: mov.b32 %r18, 65535;
61+
; CHECKPTX71-NEXT: shl.b32 %r19, %r18, %r2;
62+
; CHECKPTX71-NEXT: not.b32 %r3, %r19;
63+
; CHECKPTX71-NEXT: ld.b32 %r46, [%r1];
6464
; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start45
6565
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
66-
; CHECKPTX71-NEXT: shr.u32 %r28, %r54, %r2;
67-
; CHECKPTX71-NEXT: cvt.u16.u32 %rs2, %r28;
66+
; CHECKPTX71-NEXT: shr.u32 %r20, %r46, %r2;
67+
; CHECKPTX71-NEXT: cvt.u16.u32 %rs2, %r20;
6868
; CHECKPTX71-NEXT: mov.b16 %rs3, 0x3F80;
6969
; CHECKPTX71-NEXT: fma.rn.bf16 %rs4, %rs2, %rs3, %rs1;
70-
; CHECKPTX71-NEXT: cvt.u32.u16 %r29, %rs4;
71-
; CHECKPTX71-NEXT: shl.b32 %r30, %r29, %r2;
72-
; CHECKPTX71-NEXT: and.b32 %r31, %r54, %r3;
73-
; CHECKPTX71-NEXT: or.b32 %r32, %r31, %r30;
74-
; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r6, [%r1], %r54, %r32;
75-
; CHECKPTX71-NEXT: setp.ne.b32 %p1, %r6, %r54;
76-
; CHECKPTX71-NEXT: mov.b32 %r54, %r6;
70+
; CHECKPTX71-NEXT: cvt.u32.u16 %r21, %rs4;
71+
; CHECKPTX71-NEXT: shl.b32 %r22, %r21, %r2;
72+
; CHECKPTX71-NEXT: and.b32 %r23, %r46, %r3;
73+
; CHECKPTX71-NEXT: or.b32 %r24, %r23, %r22;
74+
; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r4, [%r1], %r46, %r24;
75+
; CHECKPTX71-NEXT: setp.ne.b32 %p1, %r4, %r46;
76+
; CHECKPTX71-NEXT: mov.b32 %r46, %r4;
7777
; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1;
7878
; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end44
79-
; CHECKPTX71-NEXT: ld.b32 %r55, [%r1];
79+
; CHECKPTX71-NEXT: ld.b32 %r47, [%r1];
8080
; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start27
8181
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
82-
; CHECKPTX71-NEXT: shr.u32 %r33, %r55, %r2;
83-
; CHECKPTX71-NEXT: cvt.u16.u32 %rs5, %r33;
82+
; CHECKPTX71-NEXT: shr.u32 %r25, %r47, %r2;
83+
; CHECKPTX71-NEXT: cvt.u16.u32 %rs5, %r25;
8484
; CHECKPTX71-NEXT: mov.b16 %rs6, 0x3F80;
8585
; CHECKPTX71-NEXT: fma.rn.bf16 %rs7, %rs5, %rs6, %rs6;
86-
; CHECKPTX71-NEXT: cvt.u32.u16 %r34, %rs7;
87-
; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r2;
88-
; CHECKPTX71-NEXT: and.b32 %r36, %r55, %r3;
89-
; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35;
90-
; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r9, [%r1], %r55, %r37;
91-
; CHECKPTX71-NEXT: setp.ne.b32 %p2, %r9, %r55;
92-
; CHECKPTX71-NEXT: mov.b32 %r55, %r9;
86+
; CHECKPTX71-NEXT: cvt.u32.u16 %r26, %rs7;
87+
; CHECKPTX71-NEXT: shl.b32 %r27, %r26, %r2;
88+
; CHECKPTX71-NEXT: and.b32 %r28, %r47, %r3;
89+
; CHECKPTX71-NEXT: or.b32 %r29, %r28, %r27;
90+
; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r5, [%r1], %r47, %r29;
91+
; CHECKPTX71-NEXT: setp.ne.b32 %p2, %r5, %r47;
92+
; CHECKPTX71-NEXT: mov.b32 %r47, %r5;
9393
; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3;
9494
; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end26
95-
; CHECKPTX71-NEXT: and.b32 %r10, %r22, -4;
96-
; CHECKPTX71-NEXT: shl.b32 %r38, %r22, 3;
97-
; CHECKPTX71-NEXT: and.b32 %r11, %r38, 24;
98-
; CHECKPTX71-NEXT: mov.b32 %r39, 65535;
99-
; CHECKPTX71-NEXT: shl.b32 %r40, %r39, %r11;
100-
; CHECKPTX71-NEXT: not.b32 %r12, %r40;
101-
; CHECKPTX71-NEXT: ld.global.b32 %r56, [%r10];
95+
; CHECKPTX71-NEXT: and.b32 %r6, %r14, -4;
96+
; CHECKPTX71-NEXT: shl.b32 %r30, %r14, 3;
97+
; CHECKPTX71-NEXT: and.b32 %r7, %r30, 24;
98+
; CHECKPTX71-NEXT: mov.b32 %r31, 65535;
99+
; CHECKPTX71-NEXT: shl.b32 %r32, %r31, %r7;
100+
; CHECKPTX71-NEXT: not.b32 %r8, %r32;
101+
; CHECKPTX71-NEXT: ld.global.b32 %r48, [%r6];
102102
; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start9
103103
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
104-
; CHECKPTX71-NEXT: shr.u32 %r41, %r56, %r11;
105-
; CHECKPTX71-NEXT: cvt.u16.u32 %rs8, %r41;
104+
; CHECKPTX71-NEXT: shr.u32 %r33, %r48, %r7;
105+
; CHECKPTX71-NEXT: cvt.u16.u32 %rs8, %r33;
106106
; CHECKPTX71-NEXT: mov.b16 %rs9, 0x3F80;
107107
; CHECKPTX71-NEXT: fma.rn.bf16 %rs10, %rs8, %rs9, %rs1;
108-
; CHECKPTX71-NEXT: cvt.u32.u16 %r42, %rs10;
109-
; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11;
110-
; CHECKPTX71-NEXT: and.b32 %r44, %r56, %r12;
111-
; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43;
112-
; CHECKPTX71-NEXT: atom.relaxed.sys.global.cas.b32 %r15, [%r10], %r56, %r45;
113-
; CHECKPTX71-NEXT: setp.ne.b32 %p3, %r15, %r56;
114-
; CHECKPTX71-NEXT: mov.b32 %r56, %r15;
108+
; CHECKPTX71-NEXT: cvt.u32.u16 %r34, %rs10;
109+
; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r7;
110+
; CHECKPTX71-NEXT: and.b32 %r36, %r48, %r8;
111+
; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35;
112+
; CHECKPTX71-NEXT: atom.relaxed.sys.global.cas.b32 %r9, [%r6], %r48, %r37;
113+
; CHECKPTX71-NEXT: setp.ne.b32 %p3, %r9, %r48;
114+
; CHECKPTX71-NEXT: mov.b32 %r48, %r9;
115115
; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5;
116116
; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end8
117-
; CHECKPTX71-NEXT: and.b32 %r16, %r23, -4;
118-
; CHECKPTX71-NEXT: shl.b32 %r46, %r23, 3;
119-
; CHECKPTX71-NEXT: and.b32 %r17, %r46, 24;
120-
; CHECKPTX71-NEXT: mov.b32 %r47, 65535;
121-
; CHECKPTX71-NEXT: shl.b32 %r48, %r47, %r17;
122-
; CHECKPTX71-NEXT: not.b32 %r18, %r48;
123-
; CHECKPTX71-NEXT: ld.shared.b32 %r57, [%r16];
117+
; CHECKPTX71-NEXT: and.b32 %r10, %r15, -4;
118+
; CHECKPTX71-NEXT: shl.b32 %r38, %r15, 3;
119+
; CHECKPTX71-NEXT: and.b32 %r11, %r38, 24;
120+
; CHECKPTX71-NEXT: mov.b32 %r39, 65535;
121+
; CHECKPTX71-NEXT: shl.b32 %r40, %r39, %r11;
122+
; CHECKPTX71-NEXT: not.b32 %r12, %r40;
123+
; CHECKPTX71-NEXT: ld.shared.b32 %r49, [%r10];
124124
; CHECKPTX71-NEXT: $L__BB0_7: // %atomicrmw.start
125125
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
126-
; CHECKPTX71-NEXT: shr.u32 %r49, %r57, %r17;
127-
; CHECKPTX71-NEXT: cvt.u16.u32 %rs11, %r49;
126+
; CHECKPTX71-NEXT: shr.u32 %r41, %r49, %r11;
127+
; CHECKPTX71-NEXT: cvt.u16.u32 %rs11, %r41;
128128
; CHECKPTX71-NEXT: mov.b16 %rs12, 0x3F80;
129129
; CHECKPTX71-NEXT: fma.rn.bf16 %rs13, %rs11, %rs12, %rs1;
130-
; CHECKPTX71-NEXT: cvt.u32.u16 %r50, %rs13;
131-
; CHECKPTX71-NEXT: shl.b32 %r51, %r50, %r17;
132-
; CHECKPTX71-NEXT: and.b32 %r52, %r57, %r18;
133-
; CHECKPTX71-NEXT: or.b32 %r53, %r52, %r51;
134-
; CHECKPTX71-NEXT: atom.relaxed.sys.shared.cas.b32 %r21, [%r16], %r57, %r53;
135-
; CHECKPTX71-NEXT: setp.ne.b32 %p4, %r21, %r57;
136-
; CHECKPTX71-NEXT: mov.b32 %r57, %r21;
130+
; CHECKPTX71-NEXT: cvt.u32.u16 %r42, %rs13;
131+
; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11;
132+
; CHECKPTX71-NEXT: and.b32 %r44, %r49, %r12;
133+
; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43;
134+
; CHECKPTX71-NEXT: atom.relaxed.sys.shared.cas.b32 %r13, [%r10], %r49, %r45;
135+
; CHECKPTX71-NEXT: setp.ne.b32 %p4, %r13, %r49;
136+
; CHECKPTX71-NEXT: mov.b32 %r49, %r13;
137137
; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7;
138138
; CHECKPTX71-NEXT: // %bb.8: // %atomicrmw.end
139139
; CHECKPTX71-NEXT: ret;

0 commit comments

Comments
 (0)