Skip to content

Commit 11fba35

Browse files
authored
[NVPTX] Add SimplifyDemandedBitsForTargetNode for PRMT (#149395)
1 parent a69cdde commit 11fba35

File tree

5 files changed

+248
-162
lines changed

5 files changed

+248
-162
lines changed

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6573,3 +6573,102 @@ void NVPTXTargetLowering::computeKnownBitsForTargetNode(
65736573
break;
65746574
}
65756575
}
6576+
6577+
static std::pair<APInt, APInt> getPRMTDemandedBits(const APInt &SelectorVal,
6578+
const APInt &DemandedBits) {
6579+
APInt DemandedLHS = APInt(32, 0);
6580+
APInt DemandedRHS = APInt(32, 0);
6581+
6582+
for (unsigned I : llvm::seq(4)) {
6583+
if (DemandedBits.extractBits(8, I * 8).isZero())
6584+
continue;
6585+
6586+
APInt Sel = SelectorVal.extractBits(4, I * 4);
6587+
unsigned Idx = Sel.getLoBits(3).getZExtValue();
6588+
unsigned Sign = Sel.getHiBits(1).getZExtValue();
6589+
6590+
APInt &Src = Idx < 4 ? DemandedLHS : DemandedRHS;
6591+
unsigned ByteStart = (Idx % 4) * 8;
6592+
if (Sign)
6593+
Src.setBit(ByteStart + 7);
6594+
else
6595+
Src.setBits(ByteStart, ByteStart + 8);
6596+
}
6597+
6598+
return {DemandedLHS, DemandedRHS};
6599+
}
6600+
6601+
// Replace undef with 0 as this is easier for other optimizations such as
6602+
// known bits.
6603+
static SDValue canonicalizePRMTInput(SDValue Op, SelectionDAG &DAG) {
6604+
if (!Op)
6605+
return SDValue();
6606+
if (Op.isUndef())
6607+
return DAG.getConstant(0, SDLoc(), MVT::i32);
6608+
return Op;
6609+
}
6610+
6611+
// Try to simplify a PRMT node given that only DemandedBits of its result are
// used. Returns the replacement value, or a null SDValue when nothing can be
// improved. Only PRMT nodes whose selector operand is a constant are handled.
static SDValue simplifyDemandedBitsForPRMT(SDValue PRMT,
                                           const APInt &DemandedBits,
                                           SelectionDAG &DAG,
                                           const TargetLowering &TLI,
                                           unsigned Depth) {
  assert(PRMT.getOpcode() == NVPTXISD::PRMT);

  SDValue LHS = PRMT.getOperand(0);
  SDValue RHS = PRMT.getOperand(1);

  // Without a constant selector the byte routing is unknown; give up.
  const auto *SelNode = dyn_cast<ConstantSDNode>(PRMT.getOperand(2));
  if (!SelNode)
    return SDValue();

  // Fold the mode operand into an explicit per-byte selector.
  const unsigned Mode = PRMT.getConstantOperandVal(3);
  const APInt Selector = getPRMTSelector(SelNode->getAPIntValue(), Mode);

  // If every byte that is still live (ignoring fully undemanded high bytes)
  // comes from one input in its original position, the PRMT is a pass-through
  // of that input.
  const unsigned DeadHighBytes = DemandedBits.countLeadingZeros() / 8;
  const unsigned LiveSelBits = (4 - DeadHighBytes) * 4;
  const APInt LiveSelector = Selector.getLoBits(LiveSelBits);
  if (LiveSelector == APInt(32, 0x3210).getLoBits(LiveSelBits))
    return LHS;
  if (LiveSelector == APInt(32, 0x7654).getLoBits(LiveSelBits))
    return RHS;

  const auto [NeededLHS, NeededRHS] =
      getPRMTDemandedBits(Selector, DemandedBits);

  // Look through multi-use operands when only part of them is required.
  SDValue NewLHS =
      TLI.SimplifyMultipleUseDemandedBits(LHS, NeededLHS, DAG, Depth + 1);
  SDValue NewRHS =
      TLI.SimplifyMultipleUseDemandedBits(RHS, NeededRHS, DAG, Depth + 1);

  NewLHS = canonicalizePRMTInput(NewLHS, DAG);
  NewRHS = canonicalizePRMTInput(NewRHS, DAG);

  // Nothing to rebuild unless at least one input actually changed.
  const bool LHSChanged = NewLHS && NewLHS != LHS;
  const bool RHSChanged = NewRHS && NewRHS != RHS;
  if (!LHSChanged && !RHSChanged)
    return SDValue();

  // Keep the original operand wherever no simplified form was produced.
  if (NewLHS)
    LHS = NewLHS;
  if (NewRHS)
    RHS = NewRHS;
  return getPRMT(LHS, RHS, Selector.getZExtValue(), SDLoc(PRMT), DAG);
}
6654+
6655+
// Target hook for demanded-bits simplification of NVPTX-specific nodes.
// PRMT nodes are handed to simplifyDemandedBitsForPRMT; when that yields a
// replacement it is committed through TLO and true is returned. In every
// other case Known is filled in via computeKnownBitsForTargetNode and false
// is returned.
bool NVPTXTargetLowering::SimplifyDemandedBitsForTargetNode(
    SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
    KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const {
  Known.resetAll();

  if (Op.getOpcode() == NVPTXISD::PRMT) {
    SDValue Replacement =
        simplifyDemandedBitsForPRMT(Op, DemandedBits, TLO.DAG, *this, Depth);
    if (Replacement) {
      TLO.CombineTo(Op, Replacement);
      return true;
    }
  }

  // No simplification applied; fall back to reporting the known bits.
  computeKnownBitsForTargetNode(Op, Known, DemandedElts, TLO.DAG, Depth);
  return false;
}

llvm/lib/Target/NVPTX/NVPTXISelLowering.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,11 @@ class NVPTXTargetLowering : public TargetLowering {
275275
const APInt &DemandedElts,
276276
const SelectionDAG &DAG,
277277
unsigned Depth = 0) const override;
278+
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits,
279+
const APInt &DemandedElts,
280+
KnownBits &Known,
281+
TargetLoweringOpt &TLO,
282+
unsigned Depth = 0) const override;
278283

279284
private:
280285
const NVPTXSubtarget &STI; // cache the subtarget here

llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll

Lines changed: 51 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -181,32 +181,32 @@ define void @combine_v16i8(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
181181
; ENABLED-NEXT: prmt.b32 %r5, %r4, 0, 0x7773U;
182182
; ENABLED-NEXT: prmt.b32 %r6, %r4, 0, 0x7772U;
183183
; ENABLED-NEXT: prmt.b32 %r7, %r4, 0, 0x7771U;
184-
; ENABLED-NEXT: prmt.b32 %r8, %r4, 0, 0x7770U;
185-
; ENABLED-NEXT: prmt.b32 %r9, %r3, 0, 0x7773U;
186-
; ENABLED-NEXT: prmt.b32 %r10, %r3, 0, 0x7772U;
187-
; ENABLED-NEXT: prmt.b32 %r11, %r3, 0, 0x7771U;
188-
; ENABLED-NEXT: prmt.b32 %r12, %r3, 0, 0x7770U;
189-
; ENABLED-NEXT: prmt.b32 %r13, %r2, 0, 0x7773U;
190-
; ENABLED-NEXT: prmt.b32 %r14, %r2, 0, 0x7772U;
191-
; ENABLED-NEXT: prmt.b32 %r15, %r2, 0, 0x7771U;
192-
; ENABLED-NEXT: prmt.b32 %r16, %r2, 0, 0x7770U;
193-
; ENABLED-NEXT: prmt.b32 %r17, %r1, 0, 0x7773U;
194-
; ENABLED-NEXT: prmt.b32 %r18, %r1, 0, 0x7772U;
195-
; ENABLED-NEXT: prmt.b32 %r19, %r1, 0, 0x7771U;
196-
; ENABLED-NEXT: prmt.b32 %r20, %r1, 0, 0x7770U;
184+
; ENABLED-NEXT: prmt.b32 %r8, %r3, 0, 0x7773U;
185+
; ENABLED-NEXT: prmt.b32 %r9, %r3, 0, 0x7772U;
186+
; ENABLED-NEXT: prmt.b32 %r10, %r3, 0, 0x7771U;
187+
; ENABLED-NEXT: prmt.b32 %r11, %r2, 0, 0x7773U;
188+
; ENABLED-NEXT: prmt.b32 %r12, %r2, 0, 0x7772U;
189+
; ENABLED-NEXT: prmt.b32 %r13, %r2, 0, 0x7771U;
190+
; ENABLED-NEXT: prmt.b32 %r14, %r1, 0, 0x7773U;
191+
; ENABLED-NEXT: prmt.b32 %r15, %r1, 0, 0x7772U;
192+
; ENABLED-NEXT: prmt.b32 %r16, %r1, 0, 0x7771U;
197193
; ENABLED-NEXT: ld.param.b64 %rd2, [combine_v16i8_param_1];
198-
; ENABLED-NEXT: add.s32 %r21, %r20, %r19;
199-
; ENABLED-NEXT: add.s32 %r22, %r21, %r18;
200-
; ENABLED-NEXT: add.s32 %r23, %r22, %r17;
201-
; ENABLED-NEXT: add.s32 %r24, %r23, %r16;
202-
; ENABLED-NEXT: add.s32 %r25, %r24, %r15;
203-
; ENABLED-NEXT: add.s32 %r26, %r25, %r14;
204-
; ENABLED-NEXT: add.s32 %r27, %r26, %r13;
205-
; ENABLED-NEXT: add.s32 %r28, %r27, %r12;
206-
; ENABLED-NEXT: add.s32 %r29, %r28, %r11;
207-
; ENABLED-NEXT: add.s32 %r30, %r29, %r10;
208-
; ENABLED-NEXT: add.s32 %r31, %r30, %r9;
209-
; ENABLED-NEXT: add.s32 %r32, %r31, %r8;
194+
; ENABLED-NEXT: and.b32 %r17, %r1, 255;
195+
; ENABLED-NEXT: and.b32 %r18, %r2, 255;
196+
; ENABLED-NEXT: and.b32 %r19, %r3, 255;
197+
; ENABLED-NEXT: and.b32 %r20, %r4, 255;
198+
; ENABLED-NEXT: add.s32 %r21, %r17, %r16;
199+
; ENABLED-NEXT: add.s32 %r22, %r21, %r15;
200+
; ENABLED-NEXT: add.s32 %r23, %r22, %r14;
201+
; ENABLED-NEXT: add.s32 %r24, %r23, %r18;
202+
; ENABLED-NEXT: add.s32 %r25, %r24, %r13;
203+
; ENABLED-NEXT: add.s32 %r26, %r25, %r12;
204+
; ENABLED-NEXT: add.s32 %r27, %r26, %r11;
205+
; ENABLED-NEXT: add.s32 %r28, %r27, %r19;
206+
; ENABLED-NEXT: add.s32 %r29, %r28, %r10;
207+
; ENABLED-NEXT: add.s32 %r30, %r29, %r9;
208+
; ENABLED-NEXT: add.s32 %r31, %r30, %r8;
209+
; ENABLED-NEXT: add.s32 %r32, %r31, %r20;
210210
; ENABLED-NEXT: add.s32 %r33, %r32, %r7;
211211
; ENABLED-NEXT: add.s32 %r34, %r33, %r6;
212212
; ENABLED-NEXT: add.s32 %r35, %r34, %r5;
@@ -332,36 +332,36 @@ define void @combine_v16i8_unaligned(ptr noundef align 8 %ptr1, ptr noundef alig
332332
; ENABLED-NEXT: prmt.b32 %r3, %r2, 0, 0x7773U;
333333
; ENABLED-NEXT: prmt.b32 %r4, %r2, 0, 0x7772U;
334334
; ENABLED-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U;
335-
; ENABLED-NEXT: prmt.b32 %r6, %r2, 0, 0x7770U;
336-
; ENABLED-NEXT: prmt.b32 %r7, %r1, 0, 0x7773U;
337-
; ENABLED-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U;
338-
; ENABLED-NEXT: prmt.b32 %r9, %r1, 0, 0x7771U;
339-
; ENABLED-NEXT: prmt.b32 %r10, %r1, 0, 0x7770U;
335+
; ENABLED-NEXT: prmt.b32 %r6, %r1, 0, 0x7773U;
336+
; ENABLED-NEXT: prmt.b32 %r7, %r1, 0, 0x7772U;
337+
; ENABLED-NEXT: prmt.b32 %r8, %r1, 0, 0x7771U;
340338
; ENABLED-NEXT: ld.param.b64 %rd2, [combine_v16i8_unaligned_param_1];
341-
; ENABLED-NEXT: ld.v2.b32 {%r11, %r12}, [%rd1+8];
342-
; ENABLED-NEXT: prmt.b32 %r13, %r12, 0, 0x7773U;
343-
; ENABLED-NEXT: prmt.b32 %r14, %r12, 0, 0x7772U;
344-
; ENABLED-NEXT: prmt.b32 %r15, %r12, 0, 0x7771U;
345-
; ENABLED-NEXT: prmt.b32 %r16, %r12, 0, 0x7770U;
346-
; ENABLED-NEXT: prmt.b32 %r17, %r11, 0, 0x7773U;
347-
; ENABLED-NEXT: prmt.b32 %r18, %r11, 0, 0x7772U;
348-
; ENABLED-NEXT: prmt.b32 %r19, %r11, 0, 0x7771U;
349-
; ENABLED-NEXT: prmt.b32 %r20, %r11, 0, 0x7770U;
350-
; ENABLED-NEXT: add.s32 %r21, %r10, %r9;
351-
; ENABLED-NEXT: add.s32 %r22, %r21, %r8;
352-
; ENABLED-NEXT: add.s32 %r23, %r22, %r7;
353-
; ENABLED-NEXT: add.s32 %r24, %r23, %r6;
339+
; ENABLED-NEXT: ld.v2.b32 {%r9, %r10}, [%rd1+8];
340+
; ENABLED-NEXT: prmt.b32 %r11, %r10, 0, 0x7773U;
341+
; ENABLED-NEXT: prmt.b32 %r12, %r10, 0, 0x7772U;
342+
; ENABLED-NEXT: prmt.b32 %r13, %r10, 0, 0x7771U;
343+
; ENABLED-NEXT: prmt.b32 %r14, %r9, 0, 0x7773U;
344+
; ENABLED-NEXT: prmt.b32 %r15, %r9, 0, 0x7772U;
345+
; ENABLED-NEXT: prmt.b32 %r16, %r9, 0, 0x7771U;
346+
; ENABLED-NEXT: and.b32 %r17, %r1, 255;
347+
; ENABLED-NEXT: and.b32 %r18, %r2, 255;
348+
; ENABLED-NEXT: and.b32 %r19, %r9, 255;
349+
; ENABLED-NEXT: and.b32 %r20, %r10, 255;
350+
; ENABLED-NEXT: add.s32 %r21, %r17, %r8;
351+
; ENABLED-NEXT: add.s32 %r22, %r21, %r7;
352+
; ENABLED-NEXT: add.s32 %r23, %r22, %r6;
353+
; ENABLED-NEXT: add.s32 %r24, %r23, %r18;
354354
; ENABLED-NEXT: add.s32 %r25, %r24, %r5;
355355
; ENABLED-NEXT: add.s32 %r26, %r25, %r4;
356356
; ENABLED-NEXT: add.s32 %r27, %r26, %r3;
357-
; ENABLED-NEXT: add.s32 %r28, %r27, %r20;
358-
; ENABLED-NEXT: add.s32 %r29, %r28, %r19;
359-
; ENABLED-NEXT: add.s32 %r30, %r29, %r18;
360-
; ENABLED-NEXT: add.s32 %r31, %r30, %r17;
361-
; ENABLED-NEXT: add.s32 %r32, %r31, %r16;
362-
; ENABLED-NEXT: add.s32 %r33, %r32, %r15;
363-
; ENABLED-NEXT: add.s32 %r34, %r33, %r14;
364-
; ENABLED-NEXT: add.s32 %r35, %r34, %r13;
357+
; ENABLED-NEXT: add.s32 %r28, %r27, %r19;
358+
; ENABLED-NEXT: add.s32 %r29, %r28, %r16;
359+
; ENABLED-NEXT: add.s32 %r30, %r29, %r15;
360+
; ENABLED-NEXT: add.s32 %r31, %r30, %r14;
361+
; ENABLED-NEXT: add.s32 %r32, %r31, %r20;
362+
; ENABLED-NEXT: add.s32 %r33, %r32, %r13;
363+
; ENABLED-NEXT: add.s32 %r34, %r33, %r12;
364+
; ENABLED-NEXT: add.s32 %r35, %r34, %r11;
365365
; ENABLED-NEXT: st.b32 [%rd2], %r35;
366366
; ENABLED-NEXT: ret;
367367
;

llvm/test/CodeGen/NVPTX/extractelement.ll

Lines changed: 34 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -56,23 +56,22 @@ define i16 @test_v4i8(i32 %a) {
5656
; CHECK-LABEL: test_v4i8(
5757
; CHECK: {
5858
; CHECK-NEXT: .reg .b16 %rs<8>;
59-
; CHECK-NEXT: .reg .b32 %r<7>;
59+
; CHECK-NEXT: .reg .b32 %r<6>;
6060
; CHECK-EMPTY:
6161
; CHECK-NEXT: // %bb.0:
6262
; CHECK-NEXT: ld.param.b32 %r1, [test_v4i8_param_0];
63-
; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x8880U;
64-
; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
65-
; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x9991U;
66-
; CHECK-NEXT: cvt.u16.u32 %rs2, %r3;
67-
; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0xaaa2U;
68-
; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
69-
; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0xbbb3U;
70-
; CHECK-NEXT: cvt.u16.u32 %rs4, %r5;
63+
; CHECK-NEXT: cvt.s8.s32 %rs1, %r1;
64+
; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x9991U;
65+
; CHECK-NEXT: cvt.u16.u32 %rs2, %r2;
66+
; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0xaaa2U;
67+
; CHECK-NEXT: cvt.u16.u32 %rs3, %r3;
68+
; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0xbbb3U;
69+
; CHECK-NEXT: cvt.u16.u32 %rs4, %r4;
7170
; CHECK-NEXT: add.s16 %rs5, %rs1, %rs2;
7271
; CHECK-NEXT: add.s16 %rs6, %rs3, %rs4;
7372
; CHECK-NEXT: add.s16 %rs7, %rs5, %rs6;
74-
; CHECK-NEXT: cvt.u32.u16 %r6, %rs7;
75-
; CHECK-NEXT: st.param.b32 [func_retval0], %r6;
73+
; CHECK-NEXT: cvt.u32.u16 %r5, %rs7;
74+
; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
7675
; CHECK-NEXT: ret;
7776
%v = bitcast i32 %a to <4 x i8>
7877
%r0 = extractelement <4 x i8> %v, i64 0
@@ -96,7 +95,7 @@ define i32 @test_v4i8_s32(i32 %a) {
9695
; CHECK-EMPTY:
9796
; CHECK-NEXT: // %bb.0:
9897
; CHECK-NEXT: ld.param.b32 %r1, [test_v4i8_s32_param_0];
99-
; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x8880U;
98+
; CHECK-NEXT: cvt.s32.s8 %r2, %r1;
10099
; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x9991U;
101100
; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0xaaa2U;
102101
; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0xbbb3U;
@@ -127,12 +126,12 @@ define i32 @test_v4i8_u32(i32 %a) {
127126
; CHECK-EMPTY:
128127
; CHECK-NEXT: // %bb.0:
129128
; CHECK-NEXT: ld.param.b32 %r1, [test_v4i8_u32_param_0];
130-
; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7770U;
131-
; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x7771U;
132-
; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7772U;
133-
; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0x7773U;
134-
; CHECK-NEXT: add.s32 %r6, %r2, %r3;
135-
; CHECK-NEXT: add.s32 %r7, %r4, %r5;
129+
; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7771U;
130+
; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x7772U;
131+
; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7773U;
132+
; CHECK-NEXT: and.b32 %r5, %r1, 255;
133+
; CHECK-NEXT: add.s32 %r6, %r5, %r2;
134+
; CHECK-NEXT: add.s32 %r7, %r3, %r4;
136135
; CHECK-NEXT: add.s32 %r8, %r6, %r7;
137136
; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
138137
; CHECK-NEXT: ret;
@@ -157,35 +156,33 @@ define i16 @test_v8i8(i64 %a) {
157156
; CHECK-LABEL: test_v8i8(
158157
; CHECK: {
159158
; CHECK-NEXT: .reg .b16 %rs<16>;
160-
; CHECK-NEXT: .reg .b32 %r<12>;
159+
; CHECK-NEXT: .reg .b32 %r<10>;
161160
; CHECK-EMPTY:
162161
; CHECK-NEXT: // %bb.0:
163162
; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_v8i8_param_0];
164-
; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x8880U;
165-
; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
166-
; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x9991U;
167-
; CHECK-NEXT: cvt.u16.u32 %rs2, %r4;
168-
; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0xaaa2U;
169-
; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
170-
; CHECK-NEXT: prmt.b32 %r6, %r1, 0, 0xbbb3U;
171-
; CHECK-NEXT: cvt.u16.u32 %rs4, %r6;
172-
; CHECK-NEXT: prmt.b32 %r7, %r2, 0, 0x8880U;
173-
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
174-
; CHECK-NEXT: prmt.b32 %r8, %r2, 0, 0x9991U;
175-
; CHECK-NEXT: cvt.u16.u32 %rs6, %r8;
176-
; CHECK-NEXT: prmt.b32 %r9, %r2, 0, 0xaaa2U;
177-
; CHECK-NEXT: cvt.u16.u32 %rs7, %r9;
178-
; CHECK-NEXT: prmt.b32 %r10, %r2, 0, 0xbbb3U;
179-
; CHECK-NEXT: cvt.u16.u32 %rs8, %r10;
163+
; CHECK-NEXT: cvt.s8.s32 %rs1, %r1;
164+
; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x9991U;
165+
; CHECK-NEXT: cvt.u16.u32 %rs2, %r3;
166+
; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0xaaa2U;
167+
; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
168+
; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0xbbb3U;
169+
; CHECK-NEXT: cvt.u16.u32 %rs4, %r5;
170+
; CHECK-NEXT: cvt.s8.s32 %rs5, %r2;
171+
; CHECK-NEXT: prmt.b32 %r6, %r2, 0, 0x9991U;
172+
; CHECK-NEXT: cvt.u16.u32 %rs6, %r6;
173+
; CHECK-NEXT: prmt.b32 %r7, %r2, 0, 0xaaa2U;
174+
; CHECK-NEXT: cvt.u16.u32 %rs7, %r7;
175+
; CHECK-NEXT: prmt.b32 %r8, %r2, 0, 0xbbb3U;
176+
; CHECK-NEXT: cvt.u16.u32 %rs8, %r8;
180177
; CHECK-NEXT: add.s16 %rs9, %rs1, %rs2;
181178
; CHECK-NEXT: add.s16 %rs10, %rs3, %rs4;
182179
; CHECK-NEXT: add.s16 %rs11, %rs5, %rs6;
183180
; CHECK-NEXT: add.s16 %rs12, %rs7, %rs8;
184181
; CHECK-NEXT: add.s16 %rs13, %rs9, %rs10;
185182
; CHECK-NEXT: add.s16 %rs14, %rs11, %rs12;
186183
; CHECK-NEXT: add.s16 %rs15, %rs13, %rs14;
187-
; CHECK-NEXT: cvt.u32.u16 %r11, %rs15;
188-
; CHECK-NEXT: st.param.b32 [func_retval0], %r11;
184+
; CHECK-NEXT: cvt.u32.u16 %r9, %rs15;
185+
; CHECK-NEXT: st.param.b32 [func_retval0], %r9;
189186
; CHECK-NEXT: ret;
190187
%v = bitcast i64 %a to <8 x i8>
191188
%r0 = extractelement <8 x i8> %v, i64 0

0 commit comments

Comments
 (0)