@@ -5176,31 +5176,30 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5176
5176
unsigned CountReg =
5177
5177
IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5178
5178
5179
- auto Exec =
5180
5179
BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5181
5180
5182
- auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
5183
- .addReg(Exec->getOperand(0).getReg());
5181
+ auto NewAccumulator =
5182
+ BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
5183
+ .addReg(ExecMask);
5184
5184
5185
- switch (Opc) {
5186
- case AMDGPU::S_XOR_B32:
5187
- case AMDGPU::S_XOR_B64: {
5188
- // Performing an XOR operation on a uniform value
5189
- // depends on the parity of the number of active lanes.
5190
- // For even parity, the result will be 0, for odd
5191
- // parity the result will be the same as the input value.
5192
- Register ParityRegister =
5193
- MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5185
+ switch (Opc) {
5186
+ case AMDGPU::S_XOR_B32:
5187
+ case AMDGPU::S_XOR_B64: {
5188
+ // Performing an XOR operation on a uniform value
5189
+ // depends on the parity of the number of active lanes.
5190
+ // For even parity, the result will be 0, for odd
5191
+ // parity the result will be the same as the input value.
5192
+ Register ParityRegister =
5193
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5194
5194
5195
- auto ParityReg =
5196
5195
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5197
5196
.addReg(NewAccumulator->getOperand(0).getReg())
5198
5197
.addImm(1)
5199
5198
.setOperandDead(3); // Dead scc
5200
5199
if (is32BitOpc) {
5201
5200
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5202
5201
.addReg(SrcReg)
5203
- .addReg(ParityReg->getOperand(0).getReg() );
5202
+ .addReg(ParityRegister );
5204
5203
break;
5205
5204
} else {
5206
5205
Register DestSub0 =
@@ -5223,15 +5222,15 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5223
5222
5224
5223
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5225
5224
.add(Op1L)
5226
- .addReg(ParityReg->getOperand(0).getReg() );
5225
+ .addReg(ParityRegister );
5227
5226
5228
5227
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5229
5228
.add(Op1H)
5230
- .addReg(ParityReg->getOperand(0).getReg() );
5229
+ .addReg(ParityRegister );
5231
5230
5232
5231
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5233
5232
.add(Op1L)
5234
- .addReg(ParityReg->getOperand(0).getReg() );
5233
+ .addReg(ParityRegister );
5235
5234
5236
5235
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
5237
5236
.addReg(CarryReg)
@@ -5250,12 +5249,11 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5250
5249
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5251
5250
5252
5251
// Take the negation of the source operand.
5253
- auto InvertedValReg =
5254
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
5255
- .addImm(-1)
5256
- .addReg(SrcReg);
5252
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
5253
+ .addImm(-1)
5254
+ .addReg(SrcReg);
5257
5255
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5258
- .addReg(InvertedValReg->getOperand(0).getReg() )
5256
+ .addReg(NegatedVal )
5259
5257
.addReg(NewAccumulator->getOperand(0).getReg());
5260
5258
break;
5261
5259
}
@@ -5294,14 +5292,13 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5294
5292
.addReg(NewAccumulator->getOperand(0).getReg())
5295
5293
.addImm(-1);
5296
5294
5297
- MachineInstr *NegatedHi =
5298
5295
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
5299
5296
.addReg(NegatedValLo)
5300
5297
.addImm(31)
5301
5298
.setOperandDead(3); // Dead scc
5302
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5303
- .add(Op1L)
5304
- .addReg(NegatedHi->getOperand(0).getReg() );
5299
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5300
+ .add(Op1L)
5301
+ .addReg(NegatedValHi );
5305
5302
}
5306
5303
Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5307
5304
? NegatedValLo
@@ -5374,17 +5371,15 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5374
5371
// Create initial values of induction variable from Exec, Accumulator and
5375
5372
// insert branch instr to newly created ComputeBlock
5376
5373
uint32_t IdentityValue = getIdentityValueForWaveReduction(Opc);
5377
- auto TmpSReg = BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator)
5378
- .addReg(ExecReg);
5374
+ BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
5379
5375
if (is32BitOpc) {
5380
5376
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5381
5377
.addImm(IdentityValue);
5382
5378
} else {
5383
5379
Register Identitylo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5384
5380
Register Identityhi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5385
- MachineInstr *IdenHi =
5386
- BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), Identityhi)
5387
- .addImm(IdentityValue);
5381
+ BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), Identityhi)
5382
+ .addImm(IdentityValue);
5388
5383
switch (Opc) {
5389
5384
case AMDGPU::V_CMP_LT_U64_e64:
5390
5385
case AMDGPU::V_CMP_LT_I64_e64:
@@ -5395,14 +5390,14 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5395
5390
IdentityValue = int32_t(0); // u|max
5396
5391
break;
5397
5392
}
5398
- MachineInstr *IdenLo =
5399
5393
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), Identitylo)
5400
5394
.addImm(IdentityValue);
5401
- BuildMI(BB, I, DL, TII->get(TargetOpcode::REG_SEQUENCE), IdentityValReg)
5402
- .addReg(IdenLo->getOperand(0).getReg())
5403
- .addImm(AMDGPU::sub0)
5404
- .addReg(IdenHi->getOperand(0).getReg())
5405
- .addImm(AMDGPU::sub1);
5395
+ BuildMI(BB, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
5396
+ IdentityValReg)
5397
+ .addReg(Identitylo)
5398
+ .addImm(AMDGPU::sub0)
5399
+ .addReg(Identityhi)
5400
+ .addImm(AMDGPU::sub1);
5406
5401
}
5407
5402
// clang-format off
5408
5403
BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
@@ -5417,24 +5412,23 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5417
5412
.addMBB(&BB);
5418
5413
auto ActiveBits =
5419
5414
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5420
- .addReg(TmpSReg->getOperand(0).getReg() )
5415
+ .addReg(LoopIterator )
5421
5416
.addMBB(&BB);
5422
5417
5423
5418
I = ComputeLoop->end();
5424
5419
MachineInstr *NewAccumulator;
5425
5420
// Perform the computations
5426
5421
unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5427
- auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5428
- .addReg(ActiveBits->getOperand(0).getReg() );
5422
+ BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5423
+ .addReg(ActiveBitsReg );
5429
5424
if (is32BitOpc) {
5430
- MachineInstr *LaneValue =
5431
- BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5432
- LaneValueReg)
5433
- .addReg(SrcReg)
5434
- .addReg(FF1->getOperand(0).getReg());
5425
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5426
+ LaneValueReg)
5427
+ .addReg(SrcReg)
5428
+ .addReg(FF1Reg);
5435
5429
NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5436
5430
.addReg(Accumulator->getOperand(0).getReg())
5437
- .addReg(LaneValue->getOperand(0).getReg() );
5431
+ .addReg(LaneValueReg );
5438
5432
} else {
5439
5433
Register LaneValueLoReg =
5440
5434
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
@@ -5453,17 +5447,17 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5453
5447
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5454
5448
LaneValueLoReg)
5455
5449
.add(Op1L)
5456
- .addReg(FF1->getOperand(0).getReg() );
5450
+ .addReg(FF1Reg );
5457
5451
MachineInstr *LaneValueHi =
5458
5452
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5459
5453
LaneValueHiReg)
5460
5454
.add(Op1H)
5461
- .addReg(FF1->getOperand(0).getReg() );
5455
+ .addReg(FF1Reg );
5462
5456
auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5463
5457
TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
5464
- .addReg(LaneValueLo->getOperand(0).getReg() )
5458
+ .addReg(LaneValueLoReg )
5465
5459
.addImm(AMDGPU::sub0)
5466
- .addReg(LaneValueHi->getOperand(0).getReg() )
5460
+ .addReg(LaneValueHiReg )
5467
5461
.addImm(AMDGPU::sub1);
5468
5462
switch (Opc) {
5469
5463
case ::AMDGPU::S_OR_B64:
@@ -5500,14 +5494,14 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5500
5494
.addImm(AMDGPU::sub0)
5501
5495
.add(SrcReg0Sub1)
5502
5496
.addImm(AMDGPU::sub1);
5503
- auto LaneMask = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
5504
- .addReg(LaneValue->getOperand(0).getReg())
5505
- .addReg(AccumulatorVReg);
5497
+ BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
5498
+ .addReg(LaneValue->getOperand(0).getReg())
5499
+ .addReg(AccumulatorVReg);
5506
5500
5507
5501
unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
5508
5502
BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
5509
- .addReg(LaneMask->getOperand(0).getReg() )
5510
- .addReg(ActiveBits->getOperand(0).getReg() );
5503
+ .addReg(LaneMaskReg )
5504
+ .addReg(ActiveBitsReg );
5511
5505
5512
5506
NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5513
5507
TII->get(AMDGPU::S_CSELECT_B64), DstReg)
@@ -5529,19 +5523,17 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5529
5523
MachineOperand Accumhi = TII->buildExtractSubRegOrImm(
5530
5524
MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub1,
5531
5525
&AMDGPU::SReg_32RegClass);
5532
- MachineInstr *DestLoComputation =
5533
- BuildMI(*ComputeLoop, I, DL, TII->get(newOpc1), DestLo)
5534
- .add(Accumlo)
5535
- .addReg(LaneValueLo->getOperand(0).getReg());
5536
- MachineInstr *DestHiComputation =
5537
- BuildMI(*ComputeLoop, I, DL, TII->get(newOpc2), DestHi)
5538
- .add(Accumhi)
5539
- .addReg(LaneValueHi->getOperand(0).getReg());
5526
+ BuildMI(*ComputeLoop, I, DL, TII->get(newOpc1), DestLo)
5527
+ .add(Accumlo)
5528
+ .addReg(LaneValueLo->getOperand(0).getReg());
5529
+ BuildMI(*ComputeLoop, I, DL, TII->get(newOpc2), DestHi)
5530
+ .add(Accumhi)
5531
+ .addReg(LaneValueHi->getOperand(0).getReg());
5540
5532
NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5541
5533
TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5542
- .addReg(DestLoComputation->getOperand(0).getReg() )
5534
+ .addReg(DestLo )
5543
5535
.addImm(AMDGPU::sub0)
5544
- .addReg(DestHiComputation->getOperand(0).getReg() )
5536
+ .addReg(DestHi )
5545
5537
.addImm(AMDGPU::sub1);
5546
5538
break;
5547
5539
}
@@ -5550,21 +5542,19 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5550
5542
// Manipulate the iterator to get the next active lane
5551
5543
unsigned BITSETOpc =
5552
5544
IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5553
- auto NewActiveBits =
5554
- BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5555
- .addReg(FF1->getOperand(0).getReg())
5556
- .addReg(ActiveBits->getOperand(0).getReg());
5545
+ BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5546
+ .addReg(FF1Reg)
5547
+ .addReg(ActiveBitsReg);
5557
5548
5558
5549
// Add phi nodes
5559
5550
Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
5560
5551
.addMBB(ComputeLoop);
5561
- ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
5562
- .addMBB(ComputeLoop);
5552
+ ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
5563
5553
5564
5554
// Creating branching
5565
5555
unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5566
5556
BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5567
- .addReg(NewActiveBits->getOperand(0).getReg() )
5557
+ .addReg(NewActiveBitsReg )
5568
5558
.addImm(0);
5569
5559
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5570
5560
.addMBB(ComputeLoop);
0 commit comments