@@ -5187,55 +5187,55 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5187
5187
BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
5188
5188
.addReg(ExecMask);
5189
5189
5190
- switch (Opc) {
5191
- case AMDGPU::S_XOR_B32:
5192
- case AMDGPU::S_XOR_B64: {
5193
- // Performing an XOR operation on a uniform value
5194
- // depends on the parity of the number of active lanes.
5195
- // For even parity, the result will be 0, for odd
5196
- // parity the result will be the same as the input value.
5197
- Register ParityRegister =
5198
- MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5199
-
5200
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5201
- .addReg(NewAccumulator->getOperand(0).getReg())
5202
- .addImm(1)
5203
- .setOperandDead(3); // Dead scc
5204
- if (is32BitOpc) {
5205
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5206
- .addReg(SrcReg)
5207
- .addReg(ParityRegister);
5208
- } else {
5209
- Register DestSub0 =
5210
- MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5211
- Register DestSub1 =
5212
- MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5213
-
5214
- const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5215
- const TargetRegisterClass *SrcSubRC =
5216
- TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5217
-
5218
- MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5219
- MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5220
- MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5221
- MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5222
-
5223
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5224
- .add(Op1L)
5225
- .addReg(ParityRegister);
5226
-
5227
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5228
- .add(Op1H)
5229
- .addReg(ParityRegister);
5230
-
5231
- BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5232
- .addReg(DestSub0)
5233
- .addImm(AMDGPU::sub0)
5234
- .addReg(DestSub1)
5235
- .addImm(AMDGPU::sub1);
5236
- }
5237
- break;
5238
- }
5190
+ switch (Opc) {
5191
+ case AMDGPU::S_XOR_B32:
5192
+ case AMDGPU::S_XOR_B64: {
5193
+ // Performing an XOR operation on a uniform value
5194
+ // depends on the parity of the number of active lanes.
5195
+ // For even parity, the result will be 0, for odd
5196
+ // parity the result will be the same as the input value.
5197
+ Register ParityRegister =
5198
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5199
+
5200
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5201
+ .addReg(NewAccumulator->getOperand(0).getReg())
5202
+ .addImm(1)
5203
+ .setOperandDead(3); // Dead scc
5204
+ if (is32BitOpc) {
5205
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5206
+ .addReg(SrcReg)
5207
+ .addReg(ParityRegister);
5208
+ } else {
5209
+ Register DestSub0 =
5210
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5211
+ Register DestSub1 =
5212
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5213
+
5214
+ const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5215
+ const TargetRegisterClass *SrcSubRC =
5216
+ TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5217
+
5218
+ MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5219
+ MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5220
+ MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5221
+ MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5222
+
5223
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5224
+ .add(Op1L)
5225
+ .addReg(ParityRegister);
5226
+
5227
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5228
+ .add(Op1H)
5229
+ .addReg(ParityRegister);
5230
+
5231
+ BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5232
+ .addReg(DestSub0)
5233
+ .addImm(AMDGPU::sub0)
5234
+ .addReg(DestSub1)
5235
+ .addImm(AMDGPU::sub1);
5236
+ }
5237
+ break;
5238
+ }
5239
5239
case AMDGPU::S_SUB_I32: {
5240
5240
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5241
5241
0 commit comments