@@ -5351,55 +5351,55 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5351
5351
BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
5352
5352
.addReg(ExecMask);
5353
5353
5354
- switch (Opc) {
5355
- case AMDGPU::S_XOR_B32:
5356
- case AMDGPU::S_XOR_B64: {
5357
- // Performing an XOR operation on a uniform value
5358
- // depends on the parity of the number of active lanes.
5359
- // For even parity, the result will be 0, for odd
5360
- // parity the result will be the same as the input value.
5361
- Register ParityRegister =
5362
- MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5363
-
5364
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5365
- .addReg(NewAccumulator->getOperand(0).getReg())
5366
- .addImm(1)
5367
- .setOperandDead(3); // Dead scc
5368
- if (is32BitOpc) {
5369
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5370
- .addReg(SrcReg)
5371
- .addReg(ParityRegister);
5372
- } else {
5373
- Register DestSub0 =
5374
- MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5375
- Register DestSub1 =
5376
- MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5377
-
5378
- const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5379
- const TargetRegisterClass *SrcSubRC =
5380
- TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5381
-
5382
- MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5383
- MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5384
- MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5385
- MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5386
-
5387
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5388
- .add(Op1L)
5389
- .addReg(ParityRegister);
5390
-
5391
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5392
- .add(Op1H)
5393
- .addReg(ParityRegister);
5394
-
5395
- BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5396
- .addReg(DestSub0)
5397
- .addImm(AMDGPU::sub0)
5398
- .addReg(DestSub1)
5399
- .addImm(AMDGPU::sub1);
5400
- }
5401
- break;
5402
- }
5354
+ switch (Opc) {
5355
+ case AMDGPU::S_XOR_B32:
5356
+ case AMDGPU::S_XOR_B64: {
5357
+ // Performing an XOR operation on a uniform value
5358
+ // depends on the parity of the number of active lanes.
5359
+ // For even parity, the result will be 0, for odd
5360
+ // parity the result will be the same as the input value.
5361
+ Register ParityRegister =
5362
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5363
+
5364
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5365
+ .addReg(NewAccumulator->getOperand(0).getReg())
5366
+ .addImm(1)
5367
+ .setOperandDead(3); // Dead scc
5368
+ if (is32BitOpc) {
5369
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5370
+ .addReg(SrcReg)
5371
+ .addReg(ParityRegister);
5372
+ } else {
5373
+ Register DestSub0 =
5374
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5375
+ Register DestSub1 =
5376
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5377
+
5378
+ const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5379
+ const TargetRegisterClass *SrcSubRC =
5380
+ TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5381
+
5382
+ MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5383
+ MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5384
+ MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5385
+ MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5386
+
5387
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5388
+ .add(Op1L)
5389
+ .addReg(ParityRegister);
5390
+
5391
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5392
+ .add(Op1H)
5393
+ .addReg(ParityRegister);
5394
+
5395
+ BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5396
+ .addReg(DestSub0)
5397
+ .addImm(AMDGPU::sub0)
5398
+ .addReg(DestSub1)
5399
+ .addImm(AMDGPU::sub1);
5400
+ }
5401
+ break;
5402
+ }
5403
5403
case AMDGPU::S_SUB_I32: {
5404
5404
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5405
5405
0 commit comments