@@ -5299,55 +5299,55 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5299
5299
BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
5300
5300
.addReg(ExecMask);
5301
5301
5302
- switch (Opc) {
5303
- case AMDGPU::S_XOR_B32:
5304
- case AMDGPU::S_XOR_B64: {
5305
- // Performing an XOR operation on a uniform value
5306
- // depends on the parity of the number of active lanes.
5307
- // For even parity, the result will be 0, for odd
5308
- // parity the result will be the same as the input value.
5309
- Register ParityRegister =
5310
- MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5311
-
5312
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5313
- .addReg(NewAccumulator->getOperand(0).getReg())
5314
- .addImm(1)
5315
- .setOperandDead(3); // Dead scc
5316
- if (is32BitOpc) {
5317
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5318
- .addReg(SrcReg)
5319
- .addReg(ParityRegister);
5320
- } else {
5321
- Register DestSub0 =
5322
- MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5323
- Register DestSub1 =
5324
- MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5325
-
5326
- const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5327
- const TargetRegisterClass *SrcSubRC =
5328
- TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5329
-
5330
- MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5331
- MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5332
- MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5333
- MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5334
-
5335
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5336
- .add(Op1L)
5337
- .addReg(ParityRegister);
5338
-
5339
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5340
- .add(Op1H)
5341
- .addReg(ParityRegister);
5342
-
5343
- BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5344
- .addReg(DestSub0)
5345
- .addImm(AMDGPU::sub0)
5346
- .addReg(DestSub1)
5347
- .addImm(AMDGPU::sub1);
5348
- }
5349
- break;
5350
- }
5302
+ switch (Opc) {
5303
+ case AMDGPU::S_XOR_B32:
5304
+ case AMDGPU::S_XOR_B64: {
5305
+ // Performing an XOR operation on a uniform value
5306
+ // depends on the parity of the number of active lanes.
5307
+ // For even parity, the result will be 0, for odd
5308
+ // parity the result will be the same as the input value.
5309
+ Register ParityRegister =
5310
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5311
+
5312
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5313
+ .addReg(NewAccumulator->getOperand(0).getReg())
5314
+ .addImm(1)
5315
+ .setOperandDead(3); // Dead scc
5316
+ if (is32BitOpc) {
5317
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5318
+ .addReg(SrcReg)
5319
+ .addReg(ParityRegister);
5320
+ } else {
5321
+ Register DestSub0 =
5322
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5323
+ Register DestSub1 =
5324
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5325
+
5326
+ const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5327
+ const TargetRegisterClass *SrcSubRC =
5328
+ TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5329
+
5330
+ MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5331
+ MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5332
+ MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5333
+ MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5334
+
5335
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5336
+ .add(Op1L)
5337
+ .addReg(ParityRegister);
5338
+
5339
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5340
+ .add(Op1H)
5341
+ .addReg(ParityRegister);
5342
+
5343
+ BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5344
+ .addReg(DestSub0)
5345
+ .addImm(AMDGPU::sub0)
5346
+ .addReg(DestSub1)
5347
+ .addImm(AMDGPU::sub1);
5348
+ }
5349
+ break;
5350
+ }
5351
5351
case AMDGPU::S_SUB_I32: {
5352
5352
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5353
5353
0 commit comments