@@ -5111,9 +5111,12 @@ static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
   case AMDGPU::S_SUB_I32:
   case AMDGPU::S_SUB_U64_PSEUDO:
   case AMDGPU::S_OR_B32:
+  case AMDGPU::S_OR_B64:
   case AMDGPU::S_XOR_B32:
+  case AMDGPU::S_XOR_B64:
     return std::numeric_limits<uint32_t>::min();
   case AMDGPU::S_AND_B32:
+  case AMDGPU::S_AND_B64:
     return std::numeric_limits<uint32_t>::max();
   default:
     llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction");
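Note on the identity values above: the reduction accumulator is seeded with a value that leaves the chosen operation unchanged, so 0 (numeric_limits min) serves OR and XOR while all-ones (numeric_limits max) serves AND, and the new 64-bit opcodes map to the same 32-bit constants. A minimal standalone sketch of that property, illustrative only and not part of the patch:

#include <cassert>
#include <cstdint>
#include <limits>

// Illustrative check: seeding with the identity value leaves any lane value
// unchanged, so the reduction can safely start from it.
int main() {
  uint32_t LaneValue = 0xDEADBEEFu;
  uint32_t OrXorIdentity = std::numeric_limits<uint32_t>::min(); // 0
  uint32_t AndIdentity = std::numeric_limits<uint32_t>::max();   // all ones
  assert((LaneValue | OrXorIdentity) == LaneValue);
  assert((LaneValue ^ OrXorIdentity) == LaneValue);
  assert((LaneValue & AndIdentity) == LaneValue);
  return 0;
}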
@@ -5142,7 +5145,9 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     case AMDGPU::S_MAX_U32:
     case AMDGPU::S_MAX_I32:
     case AMDGPU::S_AND_B32:
-    case AMDGPU::S_OR_B32: {
+    case AMDGPU::S_AND_B64:
+    case AMDGPU::S_OR_B32:
+    case AMDGPU::S_OR_B64: {
       // Idempotent operations.
       BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
       RetBB = &BB;
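For wave-uniform (SGPR) inputs, AND and OR are idempotent: folding the same value into itself any number of times gives the value back, so the lowering collapses the reduction to a plain move. A small scalar model of that fact (the helper name is made up for illustration, not from the patch):

#include <cassert>
#include <cstdint>

// Illustrative only: an idempotent reduction of a uniform value over any
// number of active lanes returns the value itself.
static uint64_t reduceOrUniform(uint64_t UniformVal, unsigned NumActiveLanes) {
  uint64_t Acc = 0; // identity for OR
  for (unsigned Lane = 0; Lane != NumActiveLanes; ++Lane)
    Acc |= UniformVal; // every active lane contributes the same value
  return Acc;
}

int main() {
  assert(reduceOrUniform(0x123456789ABCDEF0ULL, 32) == 0x123456789ABCDEF0ULL);
  assert(reduceOrUniform(0x123456789ABCDEF0ULL, 64) == 0x123456789ABCDEF0ULL);
  return 0;
}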
@@ -5159,6 +5164,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
       break;
     }
     case AMDGPU::S_XOR_B32:
+    case AMDGPU::S_XOR_B64:
     case AMDGPU::S_ADD_I32:
     case AMDGPU::S_ADD_U64_PSEUDO:
     case AMDGPU::S_SUB_I32:
@@ -5181,24 +5187,69 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
       BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
           .addReg(ExecMask);
 
-      switch (Opc) {
-      case AMDGPU::S_XOR_B32: {
-        // Performing an XOR operation on a uniform value
-        // depends on the parity of the number of active lanes.
-        // For even parity, the result will be 0, for odd
-        // parity the result will be the same as the input value.
-        Register ParityRegister =
-            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-
-        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
-            .addReg(NewAccumulator->getOperand(0).getReg())
-            .addImm(1)
-            .setOperandDead(3); // Dead scc
-        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
-            .addReg(SrcReg)
-            .addReg(ParityRegister);
-        break;
-      }
+      switch (Opc) {
+      case AMDGPU::S_XOR_B32:
+      case AMDGPU::S_XOR_B64: {
+        // Performing an XOR operation on a uniform value
+        // depends on the parity of the number of active lanes.
+        // For even parity, the result will be 0, for odd
+        // parity the result will be the same as the input value.
+        Register ParityRegister =
+            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
+            .addReg(NewAccumulator->getOperand(0).getReg())
+            .addImm(1)
+            .setOperandDead(3); // Dead scc
+        if (is32BitOpc) {
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+              .addReg(SrcReg)
+              .addReg(ParityRegister);
+          break;
+        } else {
+          Register DestSub0 =
+              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+          Register DestSub1 =
+              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+          Register Op1H_Op0L_Reg =
+              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+          Register CarryReg =
+              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+          const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+          const TargetRegisterClass *SrcSubRC =
+              TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
+
+          MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+              MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
+          MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+              MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
+
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
+              .add(Op1L)
+              .addReg(ParityRegister);
+
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
+              .add(Op1H)
+              .addReg(ParityRegister);
+
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
+              .add(Op1L)
+              .addReg(ParityRegister);
+
+          BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
+              .addReg(CarryReg)
+              .addReg(Op1H_Op0L_Reg)
+              .setOperandDead(3); // Dead scc
+
+          BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+              .addReg(DestSub0)
+              .addImm(AMDGPU::sub0)
+              .addReg(DestSub1)
+              .addImm(AMDGPU::sub1);
+          break;
+        }
+      }
       case AMDGPU::S_SUB_I32: {
         Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
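The 64-bit XOR case above keeps the parity trick used for 32 bits: XOR of a uniform value over N active lanes equals the value when N is odd and 0 when N is even, i.e. a multiply by (N & 1). Since there is no scalar 64-by-32 multiply, the result is assembled from the two 32-bit halves with S_MUL_I32, S_MUL_HI_U32 and S_ADD_U32; because the parity factor is 0 or 1, the high-part product is always 0, so this is simply the generic low-64-bit multiply expansion. A standalone C++ sketch of that arithmetic, illustrative only and not the patch's code:

#include <cassert>
#include <cstdint>

// Illustrative model of the emitted sequence: multiply a 64-bit uniform value
// by the parity bit (0 or 1) using only 32-bit scalar operations.
static uint64_t mulByParity64(uint64_t Src, uint32_t NumActiveLanes) {
  uint32_t Parity = NumActiveLanes & 1u;                    // S_AND_B32 ..., 1
  uint32_t SrcLo = static_cast<uint32_t>(Src);
  uint32_t SrcHi = static_cast<uint32_t>(Src >> 32);

  uint32_t DestLo = SrcLo * Parity;                         // S_MUL_I32
  uint32_t HiTimesParity = SrcHi * Parity;                  // S_MUL_I32
  uint32_t Carry = static_cast<uint32_t>(
      (static_cast<uint64_t>(SrcLo) * Parity) >> 32);       // S_MUL_HI_U32
  uint32_t DestHi = HiTimesParity + Carry;                  // S_ADD_U32

  // REG_SEQUENCE of sub0/sub1 in the real lowering.
  return (static_cast<uint64_t>(DestHi) << 32) | DestLo;
}

int main() {
  assert(mulByParity64(0x123456789ABCDEF0ULL, 7) == 0x123456789ABCDEF0ULL);
  assert(mulByParity64(0x123456789ABCDEF0ULL, 8) == 0);
  return 0;
}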
@@ -5412,6 +5463,15 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
                              .addReg(LaneValueHiReg)
                              .addImm(AMDGPU::sub1);
       switch (Opc) {
+      case ::AMDGPU::S_OR_B64:
+      case ::AMDGPU::S_AND_B64:
+      case ::AMDGPU::S_XOR_B64: {
+        NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+                             .addReg(Accumulator->getOperand(0).getReg())
+                             .addReg(LaneValue->getOperand(0).getReg())
+                             .setOperandDead(3); // Dead scc
+        break;
+      }
       case AMDGPU::V_CMP_GT_I64_e64:
       case AMDGPU::V_CMP_GT_U64_e64:
       case AMDGPU::V_CMP_LT_I64_e64:
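For divergent (VGPR) inputs, the lowering remains the existing iterative loop over active lanes; the hunk above only teaches it to fold each lane's 64-bit value into the accumulator with the matching scalar S_AND_B64 / S_OR_B64 / S_XOR_B64. Roughly, the loop behaves like this scalar sketch (helper names are illustrative, not from the patch):

#include <cassert>
#include <cstdint>
#include <vector>

// Illustrative model of the iterative wave reduction for a 64-bit bitwise op:
// visit each active lane, read its value, and fold it into an accumulator
// seeded with the operation's identity value.
static uint64_t waveReduceXor64(const std::vector<uint64_t> &LaneValues,
                                uint64_t ExecMask) {
  uint64_t Accumulator = 0; // identity for XOR
  while (ExecMask != 0) {
    unsigned Lane = __builtin_ctzll(ExecMask); // lowest remaining active lane
    Accumulator ^= LaneValues[Lane];           // S_XOR_B64 in the patch
    ExecMask &= ExecMask - 1;                  // retire that lane
  }
  return Accumulator;
}

int main() {
  std::vector<uint64_t> Lanes = {0xF0F0F0F0F0F0F0F0ULL, 0x0F0F0F0F0F0F0F0FULL,
                                 0xFFFFFFFFFFFFFFFFULL};
  assert(waveReduceXor64(Lanes, 0b111) == 0);
  return 0;
}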
@@ -5541,10 +5601,16 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
   case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
+  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
   case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
+  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
   case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
+  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
   case AMDGPU::S_UADDO_PSEUDO:
   case AMDGPU::S_USUBO_PSEUDO: {
     const DebugLoc &DL = MI.getDebugLoc();