@@ -5182,55 +5182,55 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
   BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
       .addReg(ExecMask);
 
-  switch (Opc) {
-  case AMDGPU::S_XOR_B32:
-  case AMDGPU::S_XOR_B64: {
-    // Performing an XOR operation on a uniform value
-    // depends on the parity of the number of active lanes.
-    // For even parity, the result will be 0, for odd
-    // parity the result will be the same as the input value.
-    Register ParityRegister =
-        MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-
-    BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
-        .addReg(NewAccumulator->getOperand(0).getReg())
-        .addImm(1)
-        .setOperandDead(3); // Dead scc
-    if (is32BitOpc) {
-      BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
-          .addReg(SrcReg)
-          .addReg(ParityRegister);
-    } else {
-      Register DestSub0 =
-          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-      Register DestSub1 =
-          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-
-      const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
-      const TargetRegisterClass *SrcSubRC =
-          TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
-
-      MachineOperand Op1L = TII->buildExtractSubRegOrImm(
-          MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
-      MachineOperand Op1H = TII->buildExtractSubRegOrImm(
-          MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
-
-      BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
-          .add(Op1L)
-          .addReg(ParityRegister);
-
-      BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
-          .add(Op1H)
-          .addReg(ParityRegister);
-
-      BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
-          .addReg(DestSub0)
-          .addImm(AMDGPU::sub0)
-          .addReg(DestSub1)
-          .addImm(AMDGPU::sub1);
-    }
-    break;
-  }
+  switch (Opc) {
+  case AMDGPU::S_XOR_B32:
+  case AMDGPU::S_XOR_B64: {
+    // Performing an XOR operation on a uniform value
+    // depends on the parity of the number of active lanes.
+    // For even parity, the result will be 0, for odd
+    // parity the result will be the same as the input value.
+    Register ParityRegister =
+        MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+    BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
+        .addReg(NewAccumulator->getOperand(0).getReg())
+        .addImm(1)
+        .setOperandDead(3); // Dead scc
+    if (is32BitOpc) {
+      BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+          .addReg(SrcReg)
+          .addReg(ParityRegister);
+    } else {
+      Register DestSub0 =
+          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+      Register DestSub1 =
+          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+      const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+      const TargetRegisterClass *SrcSubRC =
+          TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
+
+      MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+          MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
+      MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+          MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
+
+      BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
+          .add(Op1L)
+          .addReg(ParityRegister);
+
+      BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
+          .add(Op1H)
+          .addReg(ParityRegister);
+
+      BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+          .addReg(DestSub0)
+          .addImm(AMDGPU::sub0)
+          .addReg(DestSub1)
+          .addImm(AMDGPU::sub1);
+    }
+    break;
+  }
 
   case AMDGPU::S_SUB_I32: {
     Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
 
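In scalar terms, this `S_XOR` case lowers a wave-wide XOR reduction of a uniform value to `Src * (popcount(EXEC) & 1)`: XOR is self-inverse, so an even number of active lanes cancels the value to 0 while an odd number leaves it unchanged. A minimal host-side sketch of the same arithmetic follows (standalone C++20; `waveReduceXorUniform` and its parameter names are hypothetical illustrations, not part of the AMDGPU backend):

```cpp
#include <bit>
#include <cstdint>
#include <cstdio>

// Scalar model of the lowered wave-reduce XOR on a uniform input:
// XOR-ing a value into an accumulator once per active lane yields the
// value itself for an odd lane count and 0 for an even one.
// `exec` stands in for the EXEC lane mask; `src` is the uniform value.
uint64_t waveReduceXorUniform(uint64_t src, uint64_t exec) {
  uint64_t numActiveLanes = std::popcount(exec); // cf. the BitCountOpc count
  uint64_t parity = numActiveLanes & 1;          // cf. S_AND_B32 ..., 1
  // 64-bit path: multiply the low and high halves separately
  // (cf. one S_MUL_I32 per half, recombined with REG_SEQUENCE).
  uint32_t lo = static_cast<uint32_t>(src) * static_cast<uint32_t>(parity);
  uint32_t hi = static_cast<uint32_t>(src >> 32) * static_cast<uint32_t>(parity);
  return (static_cast<uint64_t>(hi) << 32) | lo;
}

int main() {
  uint64_t v = 0xDEADBEEFCAFEF00DULL;
  // 3 active lanes (odd parity): the reduction returns the input.
  std::printf("%llx\n", (unsigned long long)waveReduceXorUniform(v, 0b0111));
  // 4 active lanes (even parity): the reduction returns 0.
  std::printf("%llx\n", (unsigned long long)waveReduceXorUniform(v, 0b1111));
}
```

Note that the independent per-half multiplies in the 64-bit path are exact only because the multiplier is 0 or 1, effectively a select; a general 64-bit multiply would also need cross terms, which is why the parity trick keeps the lowering to two `S_MUL_I32`s.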