Skip to content

Commit c996130

Browse files
committed
[AMDGPU] Extending wave reduction intrinsics for i64 types - 3
Supporting Arithmetic Operations: `and`, `or`, `xor`
1 parent 41cff78 commit c996130

File tree

5 files changed

+3210
-19
lines changed

5 files changed

+3210
-19
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 85 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -5111,9 +5111,12 @@ static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
51115111
case AMDGPU::S_SUB_I32:
51125112
case AMDGPU::S_SUB_U64_PSEUDO:
51135113
case AMDGPU::S_OR_B32:
5114+
case AMDGPU::S_OR_B64:
51145115
case AMDGPU::S_XOR_B32:
5116+
case AMDGPU::S_XOR_B64:
51155117
return std::numeric_limits<uint32_t>::min();
51165118
case AMDGPU::S_AND_B32:
5119+
case AMDGPU::S_AND_B64:
51175120
return std::numeric_limits<uint32_t>::max();
51185121
default:
51195122
llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction");
@@ -5142,7 +5145,9 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
51425145
case AMDGPU::S_MAX_U32:
51435146
case AMDGPU::S_MAX_I32:
51445147
case AMDGPU::S_AND_B32:
5145-
case AMDGPU::S_OR_B32: {
5148+
case AMDGPU::S_AND_B64:
5149+
case AMDGPU::S_OR_B32:
5150+
case AMDGPU::S_OR_B64: {
51465151
// Idempotent operations.
51475152
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
51485153
RetBB = &BB;
@@ -5159,6 +5164,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
51595164
break;
51605165
}
51615166
case AMDGPU::S_XOR_B32:
5167+
case AMDGPU::S_XOR_B64:
51625168
case AMDGPU::S_ADD_I32:
51635169
case AMDGPU::S_ADD_U64_PSEUDO:
51645170
case AMDGPU::S_SUB_I32:
@@ -5181,24 +5187,69 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
51815187
BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
51825188
.addReg(ExecMask);
51835189

5184-
switch (Opc) {
5185-
case AMDGPU::S_XOR_B32: {
5186-
// Performing an XOR operation on a uniform value
5187-
// depends on the parity of the number of active lanes.
5188-
// For even parity, the result will be 0, for odd
5189-
// parity the result will be the same as the input value.
5190-
Register ParityRegister =
5191-
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5192-
5193-
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5194-
.addReg(NewAccumulator->getOperand(0).getReg())
5195-
.addImm(1)
5196-
.setOperandDead(3); // Dead scc
5197-
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5198-
.addReg(SrcReg)
5199-
.addReg(ParityRegister);
5200-
break;
5201-
}
5190+
switch (Opc) {
5191+
case AMDGPU::S_XOR_B32:
5192+
case AMDGPU::S_XOR_B64: {
5193+
// Performing an XOR operation on a uniform value
5194+
// depends on the parity of the number of active lanes.
5195+
// For even parity, the result will be 0, for odd
5196+
// parity the result will be the same as the input value.
5197+
Register ParityRegister =
5198+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5199+
5200+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5201+
.addReg(NewAccumulator->getOperand(0).getReg())
5202+
.addImm(1)
5203+
.setOperandDead(3); // Dead scc
5204+
if (is32BitOpc) {
5205+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5206+
.addReg(SrcReg)
5207+
.addReg(ParityRegister);
5208+
break;
5209+
} else {
5210+
Register DestSub0 =
5211+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5212+
Register DestSub1 =
5213+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5214+
Register Op1H_Op0L_Reg =
5215+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5216+
Register CarryReg =
5217+
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5218+
5219+
const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5220+
const TargetRegisterClass *SrcSubRC =
5221+
TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5222+
5223+
MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5224+
MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5225+
MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5226+
MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5227+
5228+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5229+
.add(Op1L)
5230+
.addReg(ParityRegister);
5231+
5232+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5233+
.add(Op1H)
5234+
.addReg(ParityRegister);
5235+
5236+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5237+
.add(Op1L)
5238+
.addReg(ParityRegister);
5239+
5240+
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
5241+
.addReg(CarryReg)
5242+
.addReg(Op1H_Op0L_Reg)
5243+
.setOperandDead(3); // Dead scc
5244+
5245+
BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5246+
.addReg(DestSub0)
5247+
.addImm(AMDGPU::sub0)
5248+
.addReg(DestSub1)
5249+
.addImm(AMDGPU::sub1);
5250+
break;
5251+
}
5252+
}
52025253
case AMDGPU::S_SUB_I32: {
52035254
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
52045255

@@ -5412,6 +5463,15 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
54125463
.addReg(LaneValueHiReg)
54135464
.addImm(AMDGPU::sub1);
54145465
switch (Opc) {
5466+
case ::AMDGPU::S_OR_B64:
5467+
case ::AMDGPU::S_AND_B64:
5468+
case ::AMDGPU::S_XOR_B64: {
5469+
NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5470+
.addReg(Accumulator->getOperand(0).getReg())
5471+
.addReg(LaneValue->getOperand(0).getReg())
5472+
.setOperandDead(3); // Dead scc
5473+
break;
5474+
}
54155475
case AMDGPU::V_CMP_GT_I64_e64:
54165476
case AMDGPU::V_CMP_GT_U64_e64:
54175477
case AMDGPU::V_CMP_LT_I64_e64:
@@ -5541,10 +5601,16 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
55415601
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
55425602
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
55435603
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
5604+
case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
5605+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
55445606
case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
55455607
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
5608+
case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
5609+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
55465610
case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
55475611
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
5612+
case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
5613+
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
55485614
case AMDGPU::S_UADDO_PSEUDO:
55495615
case AMDGPU::S_USUBO_PSEUDO: {
55505616
const DebugLoc &DL = MI.getDebugLoc();

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,9 @@ defvar Operations = [
347347
WaveReduceOp<"max", "I64", i64, SGPR_64, VSrc_b64>,
348348
WaveReduceOp<"add", "U64", i64, SGPR_64, VSrc_b64>,
349349
WaveReduceOp<"sub", "U64", i64, SGPR_64, VSrc_b64>,
350+
WaveReduceOp<"and", "B64", i64, SGPR_64, VSrc_b64>,
351+
WaveReduceOp<"or", "B64", i64, SGPR_64, VSrc_b64>,
352+
WaveReduceOp<"xor", "B64", i64, SGPR_64, VSrc_b64>,
350353
];
351354

352355
foreach Op = Operations in {

0 commit comments

Comments
 (0)