@@ -5161,39 +5161,39 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5161
5161
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5162
5162
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5163
5163
Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5164
- Register ActiveLanes =
5164
+ Register NumActiveLanes =
5165
5165
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5166
5166
5167
5167
bool IsWave32 = ST.isWave32();
5168
5168
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5169
5169
MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5170
- unsigned CountReg =
5170
+ unsigned BitCountOpc =
5171
5171
IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5172
5172
5173
- BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5174
-
5175
- auto NewAccumulator =
5176
- BuildMI(BB, MI, DL, TII->get(CountReg ), ActiveLanes )
5177
- .addReg(ExecMask);
5178
-
5179
- switch (Opc) {
5180
- case AMDGPU::S_XOR_B32: {
5181
- // Performing an XOR operation on a uniform value
5182
- // depends on the parity of the number of active lanes.
5183
- // For even parity, the result will be 0, for odd
5184
- // parity the result will be the same as the input value.
5185
- Register ParityRegister =
5186
- MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5187
-
5188
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5189
- .addReg(NewAccumulator->getOperand(0).getReg())
5190
- .addImm(1)
5191
- .setOperandDead(3); // Dead scc
5192
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5193
- .addReg(SrcReg)
5194
- .addReg(ParityRegister);
5195
- break;
5196
- }
5173
+ BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5174
+
5175
+ auto NewAccumulator =
5176
+ BuildMI(BB, MI, DL, TII->get(BitCountOpc ), NumActiveLanes )
5177
+ .addReg(ExecMask);
5178
+
5179
+ switch (Opc) {
5180
+ case AMDGPU::S_XOR_B32: {
5181
+ // Performing an XOR operation on a uniform value
5182
+ // depends on the parity of the number of active lanes.
5183
+ // For even parity, the result will be 0, for odd
5184
+ // parity the result will be the same as the input value.
5185
+ Register ParityRegister =
5186
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5187
+
5188
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5189
+ .addReg(NewAccumulator->getOperand(0).getReg())
5190
+ .addImm(1)
5191
+ .setOperandDead(3); // Dead scc
5192
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5193
+ .addReg(SrcReg)
5194
+ .addReg(ParityRegister);
5195
+ break;
5196
+ }
5197
5197
case AMDGPU::S_SUB_I32: {
5198
5198
Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5199
5199
@@ -5445,8 +5445,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5445
5445
.addReg(Accumulator->getOperand(0).getReg());
5446
5446
break;
5447
5447
}
5448
- case :: AMDGPU::S_ADD_U64_PSEUDO:
5449
- case :: AMDGPU::S_SUB_U64_PSEUDO: {
5448
+ case AMDGPU::S_ADD_U64_PSEUDO:
5449
+ case AMDGPU::S_SUB_U64_PSEUDO: {
5450
5450
unsigned newOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADD_U32
5451
5451
: AMDGPU::S_SUB_U32;
5452
5452
unsigned newOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADDC_U32
0 commit comments