diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 00c7f0eb6e9f1..c61f3a54ec2b9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -820,6 +820,13 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
     SelectSTACKRESTORE(N);
     return;
   }
+  case ISD::OR: {
+    if (SDNode *Selected = selectRotateOrFunnelShiftPattern(N)) {
+      ReplaceNode(N, Selected);
+      return;
+    }
+    break;
+  }
   }
 
   SelectCode(N);
@@ -4105,6 +4112,161 @@ void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
   } while (IsModified);
 }
 
+// Match the OR-of-shifts form produced when a 32-bit ROTR/FSHR is expanded
+// and select it to a single v_alignbit_b32, which computes
+//   dst = ({src0, src1} >> (src2 & 31)) & 0xffffffff.
+//
+// Recognized shapes (the OR operands may appear in either order):
+//   constant: (hi << C0) | (lo >> C1)                  with C0 + C1 == 32
+//   constant: ((hi << 1) << C0) | (lo >> C1)           with C0 + C1 == 31
+//   rotr:     (x << ((0 - amt) & 31)) | (x >> (amt & 31))
+//   fshr:     ((hi << 1) << (~amt & 31)) | (lo >> (amt & 31))
+//
+// Returns the selected machine node, or nullptr when N does not match.
+SDNode *AMDGPUDAGToDAGISel::selectRotateOrFunnelShiftPattern(SDNode *N) {
+  if (N->getOpcode() != ISD::OR)
+    return nullptr;
+
+  // Only handle 32-bit operations
+  if (N->getValueType(0) != MVT::i32)
+    return nullptr;
+
+  // v_alignbit_b32 is a VALU instruction; leave uniform values alone.
+  if (!N->isDivergent())
+    return nullptr;
+
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  // Accept both operand orders: (shl | srl) and (srl | shl).
+  SDNode *ShlNode = nullptr;
+  SDNode *SrlNode = nullptr;
+  if (LHS.getOpcode() == ISD::SHL && RHS.getOpcode() == ISD::SRL) {
+    ShlNode = LHS.getNode();
+    SrlNode = RHS.getNode();
+  } else if (LHS.getOpcode() == ISD::SRL && RHS.getOpcode() == ISD::SHL) {
+    ShlNode = RHS.getNode();
+    SrlNode = LHS.getNode();
+  } else {
+    return nullptr;
+  }
+
+  // Extract sources and shift amounts
+  SDValue ShlSrc = ShlNode->getOperand(0);
+  SDValue ShlAmt = ShlNode->getOperand(1);
+  SDValue SrlSrc = SrlNode->getOperand(0);
+  SDValue SrlAmt = SrlNode->getOperand(1);
+
+  // The FSHR expansion pre-shifts the high source: (hi << 1). Keep the
+  // unshifted value separately instead of overwriting ShlSrc, because the
+  // extra bit of shift must be accounted for by each pattern below.
+  SDValue PreShlSrc;
+  if (ShlSrc.getOpcode() == ISD::SHL)
+    if (auto *PreShlAmt = dyn_cast<ConstantSDNode>(ShlSrc.getOperand(1)))
+      if (PreShlAmt->getZExtValue() == 1)
+        PreShlSrc = ShlSrc.getOperand(0);
+
+  // Helper function to build AlignBit instruction
+  auto buildAlignBitInstruction = [&](SDValue AlignBitSrc0,
+                                      SDValue AlignBitSrc1,
+                                      SDValue ShiftAmount) -> SDNode * {
+    SDLoc DL(N);
+
+    // Select opcode based on subtarget features
+    const GCNSubtarget &ST = CurDAG->getSubtarget<GCNSubtarget>();
+    unsigned Opcode = AMDGPU::V_ALIGNBIT_B32_e64;
+    if (ST.getGeneration() >= AMDGPUSubtarget::GFX11 ||
+        ST.hasTrue16BitInsts())
+      Opcode = ST.useRealTrue16Insts() ? AMDGPU::V_ALIGNBIT_B32_t16_e64
+                                       : AMDGPU::V_ALIGNBIT_B32_fake16_e64;
+
+    if (Opcode == AMDGPU::V_ALIGNBIT_B32_e64) {
+      // Regular e64 format
+      SDValue Ops[] = {AlignBitSrc0, AlignBitSrc1, ShiftAmount};
+      return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Ops);
+    }
+
+    // t16/fake16 format: every source is preceded by its modifiers operand.
+    // NOTE(review): the t16 form may expect a 16-bit src2 register; confirm
+    // that passing the 32-bit amount is accepted here.
+    SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
+    SDValue Ops[] = {Zero, AlignBitSrc0, // src0_modifiers, src0
+                     Zero, AlignBitSrc1, // src1_modifiers, src1
+                     Zero, ShiftAmount,  // src2_modifiers, src2
+                     Zero, Zero};        // clamp, op_sel
+    return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Ops);
+  };
+
+  // Case 1: Both shift amounts are constants
+  auto *ShlConstant = dyn_cast<ConstantSDNode>(ShlAmt);
+  auto *SrlConstant = dyn_cast<ConstantSDNode>(SrlAmt);
+  if (ShlConstant && SrlConstant) {
+    uint64_t ShlVal = ShlConstant->getZExtValue();
+    uint64_t SrlVal = SrlConstant->getZExtValue();
+
+    // Out-of-range shift amounts yield poison; do not match them.
+    if (ShlVal >= 32 || SrlVal >= 32)
+      return nullptr;
+
+    // The total left shift (including a stripped pre-shift) and the right
+    // shift must add up to the full 32-bit width.
+    SDValue HiSrc;
+    if (ShlVal + SrlVal == 32)
+      HiSrc = ShlSrc;
+    else if (PreShlSrc && ShlVal + 1 + SrlVal == 32)
+      HiSrc = PreShlSrc;
+    else
+      return nullptr;
+
+    SDValue ConstAmtNode =
+        CurDAG->getTargetConstant(SrlVal, SDLoc(N), MVT::i32);
+    return buildAlignBitInstruction(HiSrc, SrlSrc, ConstAmtNode);
+  }
+
+  // Helper to extract shift amount from (some_value & 31) pattern
+  auto getShiftAmount = [](SDValue ShiftAmtVal) -> SDValue {
+    if (ShiftAmtVal.getOpcode() == ISD::AND)
+      if (auto *MaskNode = dyn_cast<ConstantSDNode>(ShiftAmtVal.getOperand(1)))
+        if (MaskNode->getZExtValue() == 31)
+          return ShiftAmtVal.getOperand(0);
+    return SDValue();
+  };
+
+  // Case 2: Variable shift amounts - both must be masked with 31
+  SDValue ShlAmtSrc = getShiftAmount(ShlAmt);
+  SDValue SrlAmtSrc = getShiftAmount(SrlAmt);
+  if (!ShlAmtSrc || !SrlAmtSrc)
+    return nullptr;
+
+  if (ShlAmtSrc.getOpcode() == ISD::XOR) {
+    // FSHR pattern: SHL amount = (~amt) & 31. On its own that shifts left by
+    // 31 - amt, so the pattern is only a funnel shift when the high source
+    // was pre-shifted by one.
+    if (!PreShlSrc)
+      return nullptr;
+    auto *XorMask = dyn_cast<ConstantSDNode>(ShlAmtSrc.getOperand(1));
+    if (!XorMask || XorMask->getSExtValue() != -1)
+      return nullptr;
+    if (ShlAmtSrc.getOperand(0) != SrlAmtSrc)
+      return nullptr;
+    return buildAlignBitInstruction(PreShlSrc, SrlSrc, SrlAmtSrc);
+  }
+
+  if (ShlAmtSrc.getOpcode() == ISD::SUB) {
+    // ROTR pattern: SHL amount = (0 - amt) & 31. For amt == 0 the expression
+    // degenerates to (hi | lo), which only equals alignbit(hi, lo, 0) == lo
+    // when both shifts read the same value, i.e. for a true rotate.
+    auto *SubLHS = dyn_cast<ConstantSDNode>(ShlAmtSrc.getOperand(0));
+    if (!SubLHS || SubLHS->getZExtValue() != 0)
+      return nullptr;
+    if (ShlAmtSrc.getOperand(1) != SrlAmtSrc || ShlSrc != SrlSrc)
+      return nullptr;
+    return buildAlignBitInstruction(ShlSrc, ShlSrc, SrlAmtSrc);
+  }
+
+  return nullptr;
+}
+
 AMDGPUDAGToDAGISelLegacy::AMDGPUDAGToDAGISelLegacy(TargetMachine &TM,
                                                    CodeGenOptLevel OptLevel)
     : SelectionDAGISelLegacy(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index acbab3d9e2d81..b73259054d581 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -284,6 +284,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   void SelectINTRINSIC_VOID(SDNode *N);
   void SelectWAVE_ADDRESS(SDNode *N);
   void SelectSTACKRESTORE(SDNode *N);
+  SDNode *selectRotateOrFunnelShiftPattern(SDNode *N);
 
 protected:
   // Include the pieces autogenerated from the target description.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index b037cdd5393ea..49d122a91c7e8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -486,12 +486,17 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
     setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal);
   }
 
-  // The hardware supports 32-bit FSHR, but not FSHL.
-  setOperationAction(ISD::FSHR, MVT::i32, Legal);
+  if (Subtarget->isGCN()) {
+    setOperationAction(ISD::FSHR, MVT::i32, Expand);
+    setOperationAction(ISD::ROTR, {MVT::i32, MVT::i64}, Expand);
+  } else {
+    setOperationAction(ISD::FSHR, MVT::i32, Legal);
+    setOperationAction(ISD::ROTR, MVT::i32, Legal);
+    setOperationAction(ISD::ROTR, MVT::i64, Expand);
+  }
 
   // The hardware supports 32-bit ROTR, but not ROTL.
   setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
-  setOperationAction(ISD::ROTR, MVT::i64, Expand);
 
   setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 8975486caa770..78506d8976f22 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -406,6 +406,219 @@ bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
   return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
 }
 
+// GlobalISel counterpart of the rotate/funnel-shift matcher: fold a 32-bit
+// VGPR-bank G_OR of a G_SHL and a G_LSHR into one V_ALIGNBIT_B32, which
+// computes dst = ({src0, src1} >> (src2 & 31)) & 0xffffffff.
+//
+// Recognized shapes (the G_OR operands may appear in either order):
+//   constant: (hi << C0) | (lo >> C1)                  with C0 + C1 == 32
+//   constant: ((hi << 1) << C0) | (lo >> C1)           with C0 + C1 == 31
+//   rotr:     (x << ((0 - amt) & 31)) | (x >> (amt & 31))
+//   fshr:     ((hi << 1) << (~amt & 31)) | (lo >> (amt & 31))
+//
+// Returns true if I was replaced by the new instruction.
+bool AMDGPUInstructionSelector::selectRotateOrFunnelShiftPattern(
+    MachineInstr &I) const {
+  Register DstReg = I.getOperand(0).getReg();
+  Register LHS = I.getOperand(1).getReg();
+  Register RHS = I.getOperand(2).getReg();
+
+  // V_ALIGNBIT_B32 is a VALU instruction; only handle VGPR-bank results.
+  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
+  if (!DstRB || DstRB->getID() != AMDGPU::VGPRRegBankID)
+    return false;
+
+  // Check if this is a 32-bit operation
+  if (MRI->getType(DstReg).getSizeInBits() != 32)
+    return false;
+
+  // The lookup may fail; bail out instead of dereferencing a null def.
+  MachineInstr *LHSInst = getDefIgnoringCopies(LHS, *MRI);
+  MachineInstr *RHSInst = getDefIgnoringCopies(RHS, *MRI);
+  if (!LHSInst || !RHSInst)
+    return false;
+
+  // Accept both operand orders: (shl | lshr) and (lshr | shl).
+  MachineInstr *ShlInst = nullptr;
+  MachineInstr *SrlInst = nullptr;
+  if (LHSInst->getOpcode() == TargetOpcode::G_SHL &&
+      RHSInst->getOpcode() == TargetOpcode::G_LSHR) {
+    ShlInst = LHSInst;
+    SrlInst = RHSInst;
+  } else if (LHSInst->getOpcode() == TargetOpcode::G_LSHR &&
+             RHSInst->getOpcode() == TargetOpcode::G_SHL) {
+    ShlInst = RHSInst;
+    SrlInst = LHSInst;
+  } else {
+    return false;
+  }
+
+  Register ShlSrc = ShlInst->getOperand(1).getReg();
+  Register SrlSrc = SrlInst->getOperand(1).getReg();
+
+  // The FSHR expansion pre-shifts the high source: (hi << 1). Keep the
+  // unshifted register separately instead of overwriting ShlSrc, because the
+  // extra bit of shift must be accounted for by each pattern below.
+  Register PreShlSrc;
+  MachineInstr *PreShlInst = getDefIgnoringCopies(ShlSrc, *MRI);
+  if (PreShlInst && PreShlInst->getOpcode() == TargetOpcode::G_SHL) {
+    std::optional<ValueAndVReg> PreShlAmt = getIConstantVRegValWithLookThrough(
+        PreShlInst->getOperand(2).getReg(), *MRI);
+    if (PreShlAmt && PreShlAmt->Value.getZExtValue() == 1)
+      PreShlSrc = PreShlInst->getOperand(1).getReg();
+  }
+
+  // Two registers hold the same value if they match after looking through
+  // copies.
+  auto isSameValue = [&](Register A, Register B) {
+    if (A == B)
+      return true;
+    Register SrcA = getSrcRegIgnoringCopies(A, *MRI);
+    return SrcA.isValid() && SrcA == getSrcRegIgnoringCopies(B, *MRI);
+  };
+
+  // Helper function to build AlignBit instruction
+  auto buildAlignBitInstruction = [&](Register AlignBitSrc0,
+                                      Register AlignBitSrc1,
+                                      Register ShiftAmount) -> bool {
+    const DebugLoc &DL = I.getDebugLoc();
+    MachineBasicBlock *BB = I.getParent();
+
+    // Select opcode based on subtarget features
+    unsigned Opcode = AMDGPU::V_ALIGNBIT_B32_e64;
+    if (STI.getGeneration() >= AMDGPUSubtarget::GFX11 ||
+        STI.hasTrue16BitInsts())
+      Opcode = STI.useRealTrue16Insts() ? AMDGPU::V_ALIGNBIT_B32_t16_e64
+                                        : AMDGPU::V_ALIGNBIT_B32_fake16_e64;
+
+    auto isSGPR = [&](Register Reg) {
+      const RegisterBank *RB = RBI.getRegBank(Reg, *MRI, TRI);
+      return RB && RB->getID() == AMDGPU::SGPRRegBankID;
+    };
+
+    // Check the constant bus restriction: while more SGPR operands are used
+    // than the limit allows, copy them (in operand order) into fresh VGPRs.
+    Register Srcs[] = {AlignBitSrc0, AlignBitSrc1, ShiftAmount};
+    unsigned ConstantBusLimit = STI.getConstantBusLimit(Opcode);
+    unsigned SGPRCount = llvm::count_if(Srcs, isSGPR);
+    for (Register &Src : Srcs) {
+      if (SGPRCount <= ConstantBusLimit)
+        break;
+      if (!isSGPR(Src))
+        continue;
+      Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_MOV_B32_e32), VGPR).addReg(Src);
+      Src = VGPR;
+      --SGPRCount;
+    }
+
+    auto AlignBit = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg);
+    if (Opcode == AMDGPU::V_ALIGNBIT_B32_e64) {
+      AlignBit.addReg(Srcs[0]).addReg(Srcs[1]).addReg(Srcs[2]);
+    } else {
+      // t16/fake16 variants have extended operand format
+      AlignBit
+          .addImm(0)       // src0_modifiers
+          .addReg(Srcs[0]) // src0
+          .addImm(0)       // src1_modifiers
+          .addReg(Srcs[1]) // src1
+          .addImm(0)       // src2_modifiers
+          .addReg(Srcs[2]) // src2
+          .addImm(0)       // clamp
+          .addImm(0);      // op_sel
+    }
+
+    I.eraseFromParent();
+    return constrainSelectedInstRegOperands(*AlignBit, TII, TRI, RBI);
+  };
+
+  // Get shift amounts for both SHL and SRL
+  Register ShlAmtReg = ShlInst->getOperand(2).getReg();
+  Register SrlAmtReg = SrlInst->getOperand(2).getReg();
+
+  // Case 1: Both shift amounts are constants (may be through COPY instructions)
+  auto ShlConstVal = getIConstantVRegValWithLookThrough(ShlAmtReg, *MRI);
+  auto SrlConstVal = getIConstantVRegValWithLookThrough(SrlAmtReg, *MRI);
+  if (ShlConstVal && SrlConstVal) {
+    // Out-of-range shift amounts yield poison; do not match them.
+    if (ShlConstVal->Value.uge(32) || SrlConstVal->Value.uge(32))
+      return false;
+    uint64_t ShlVal = ShlConstVal->Value.getZExtValue();
+    uint64_t SrlVal = SrlConstVal->Value.getZExtValue();
+
+    // The total left shift (including a stripped pre-shift) and the right
+    // shift must add up to the full 32-bit width.
+    Register HiSrc;
+    if (ShlVal + SrlVal == 32)
+      HiSrc = ShlSrc;
+    else if (PreShlSrc.isValid() && ShlVal + 1 + SrlVal == 32)
+      HiSrc = PreShlSrc;
+    else
+      return false;
+
+    // Create a constant register for the original shift amount (SRL amount)
+    Register ConstAmtReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    BuildMI(*I.getParent(), &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32),
+            ConstAmtReg)
+        .addImm(SrlVal);
+
+    return buildAlignBitInstruction(HiSrc, SrlSrc, ConstAmtReg);
+  }
+
+  // Helper to extract shift amount from (some_value & 31) pattern
+  auto getShiftAmount = [&](Register ShiftAmtReg) -> std::optional<Register> {
+    MachineInstr *AndInst = getDefIgnoringCopies(ShiftAmtReg, *MRI);
+    if (!AndInst || AndInst->getOpcode() != TargetOpcode::G_AND)
+      return std::nullopt;
+    std::optional<ValueAndVReg> MaskVal = getIConstantVRegValWithLookThrough(
+        AndInst->getOperand(2).getReg(), *MRI);
+    if (!MaskVal || MaskVal->Value.getZExtValue() != 31)
+      return std::nullopt;
+    return AndInst->getOperand(1).getReg();
+  };
+
+  // Case 2: Variable shift amounts - both must be masked with 31
+  std::optional<Register> ShlAmtSrc = getShiftAmount(ShlAmtReg);
+  std::optional<Register> SrlAmtSrc = getShiftAmount(SrlAmtReg);
+  if (!ShlAmtSrc || !SrlAmtSrc)
+    return false;
+
+  MachineInstr *ShlAmtDef = getDefIgnoringCopies(*ShlAmtSrc, *MRI);
+  if (!ShlAmtDef)
+    return false;
+
+  if (ShlAmtDef->getOpcode() == TargetOpcode::G_XOR) {
+    // FSHR pattern: SHL amount = (~amt) & 31. On its own that shifts left by
+    // 31 - amt, so the pattern is only a funnel shift when the high source
+    // was pre-shifted by one.
+    if (!PreShlSrc.isValid())
+      return false;
+    std::optional<ValueAndVReg> XorMaskVal = getIConstantVRegValWithLookThrough(
+        ShlAmtDef->getOperand(2).getReg(), *MRI);
+    if (!XorMaskVal || XorMaskVal->Value.getSExtValue() != -1)
+      return false;
+    if (ShlAmtDef->getOperand(1).getReg() != *SrlAmtSrc)
+      return false;
+    return buildAlignBitInstruction(PreShlSrc, SrlSrc, *SrlAmtSrc);
+  }
+
+  if (ShlAmtDef->getOpcode() == TargetOpcode::G_SUB) {
+    // ROTR pattern: SHL amount = (0 - amt) & 31. For amt == 0 the expression
+    // degenerates to (hi | lo), which only equals alignbit(hi, lo, 0) == lo
+    // when both shifts read the same value, i.e. for a true rotate.
+    std::optional<ValueAndVReg> SubLHSVal = getIConstantVRegValWithLookThrough(
+        ShlAmtDef->getOperand(1).getReg(), *MRI);
+    if (!SubLHSVal || SubLHSVal->Value.getZExtValue() != 0)
+      return false;
+    if (ShlAmtDef->getOperand(2).getReg() != *SrlAmtSrc ||
+        !isSameValue(ShlSrc, SrlSrc))
+      return false;
+    return buildAlignBitInstruction(ShlSrc, ShlSrc, *SrlAmtSrc);
+  }
+
+  return false;
+}
+
 bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
   MachineBasicBlock *BB = I.getParent();
   MachineFunction *MF = BB->getParent();
@@ -4033,6 +4246,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
   case TargetOpcode::G_XOR:
     if (selectBITOP3(I))
       return true;
+    if (I.getOpcode() == TargetOpcode::G_OR && selectRotateOrFunnelShiftPattern(I))
+      return true;
     if (selectImpl(I, *CoverageInfo))
       return true;
     return selectG_AND_OR_XOR(I);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 34bdf0a6d4ab2..46cdf813330b4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -97,6 +97,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   bool selectG_FNEG(MachineInstr &I) const;
   bool selectG_FABS(MachineInstr &I) const;
   bool selectG_AND_OR_XOR(MachineInstr &I) const;
+  bool selectRotateOrFunnelShiftPattern(MachineInstr &I) const;
   bool selectG_ADD_SUB(MachineInstr &I) const;
   bool selectG_UADDO_USUBO_UADDE_USUBE(MachineInstr &I) const;
   bool selectG_AMDGPU_MAD_64_32(MachineInstr &I) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index e7bf88d2ee5b6..b1b19332d870c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2041,13 +2041,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
       .clampScalar(0, S32, S64)
       .lower();
 
-  getActionDefinitionsBuilder({G_ROTR, G_ROTL})
-    .scalarize(0)
-    .lower();
+  getActionDefinitionsBuilder({G_ROTR, G_ROTL}).scalarize(0).lower();
 
   // TODO: Only Try to form v2s16 with legal packed instructions.
getActionDefinitionsBuilder(G_FSHR) - .legalFor({{S32, S32}}) .lowerFor({{V2S16, V2S16}}) .clampMaxNumElementsStrict(0, S16, 2) .scalarize(0) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index b54cccead9781..a280b84a4667b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4089,6 +4089,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_AMDGPU_SMED3: case AMDGPU::G_AMDGPU_FMED3: return getDefaultMappingVOP(MI); + case AMDGPU::G_ROTR: + case AMDGPU::G_ROTL: { + if (isSALUMapping(MI)) + return getDefaultMappingSOP(MI); + return getDefaultMappingVOP(MI); + } case AMDGPU::G_UMULH: case AMDGPU::G_SMULH: { if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI)) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll index fc81e16d68e98..3e65697c07450 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -1768,102 +1768,102 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) { define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 inreg %amt.arg) { ; GFX6-LABEL: s_fshl_v2i24: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX6-NEXT: s_bfe_u32 s9, s0, 0x80008 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: s_lshr_b32 s6, s0, 16 -; GFX6-NEXT: s_and_b32 s8, s0, 0xff -; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX6-NEXT: s_lshl_b32 s9, s9, 8 -; GFX6-NEXT: s_lshr_b32 s7, s1, 8 -; GFX6-NEXT: s_or_b32 s8, s8, s9 +; GFX6-NEXT: s_lshr_b32 s7, s0, 24 +; GFX6-NEXT: s_and_b32 s9, s0, 0xff +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x80008 +; GFX6-NEXT: s_lshl_b32 s0, s0, 8 +; GFX6-NEXT: s_or_b32 s0, s9, s0 ; GFX6-NEXT: s_and_b32 s6, s6, 
0xff +; GFX6-NEXT: s_lshr_b32 s8, s1, 8 +; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX6-NEXT: s_lshl_b32 s6, s6, 16 ; GFX6-NEXT: s_and_b32 s1, s1, 0xff -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: s_and_b32 s8, 0xffff, s8 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX6-NEXT: s_or_b32 s0, s0, s6 +; GFX6-NEXT: s_lshl_b32 s1, s1, 8 +; GFX6-NEXT: s_and_b32 s6, s8, 0xff +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX6-NEXT: s_or_b32 s1, s7, s1 +; GFX6-NEXT: s_lshl_b32 s6, s6, 16 +; GFX6-NEXT: s_or_b32 s1, s1, s6 +; GFX6-NEXT: s_lshr_b32 s6, s2, 16 +; GFX6-NEXT: s_lshr_b32 s7, s2, 24 +; GFX6-NEXT: s_and_b32 s9, s2, 0xff +; GFX6-NEXT: s_bfe_u32 s2, s2, 0x80008 +; GFX6-NEXT: s_lshl_b32 s2, s2, 8 +; GFX6-NEXT: v_not_b32_e32 v1, 23 +; GFX6-NEXT: s_or_b32 s2, s9, s2 +; GFX6-NEXT: s_and_b32 s6, s6, 0xff +; GFX6-NEXT: v_mul_lo_u32 v2, v0, v1 +; GFX6-NEXT: s_lshr_b32 s8, s3, 8 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 24 -; GFX6-NEXT: s_and_b32 s0, s7, 0xff -; GFX6-NEXT: v_not_b32_e32 v3, 23 -; GFX6-NEXT: s_or_b32 s6, s8, s6 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_bfe_u32 s8, s2, 0x80008 -; GFX6-NEXT: v_mul_lo_u32 v4, v2, v3 -; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX6-NEXT: s_lshr_b32 s0, s2, 16 -; GFX6-NEXT: s_and_b32 s7, s2, 0xff -; GFX6-NEXT: s_lshl_b32 s8, s8, 8 -; GFX6-NEXT: s_lshr_b32 s1, s3, 8 -; GFX6-NEXT: s_or_b32 s7, s7, s8 -; GFX6-NEXT: s_and_b32 s0, s0, 0xff ; GFX6-NEXT: s_and_b32 s3, s3, 0xff -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_and_b32 s7, 0xffff, s7 -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 24 -; GFX6-NEXT: s_and_b32 s1, s1, 0xff -; GFX6-NEXT: s_or_b32 s0, s7, s0 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX6-NEXT: v_mul_hi_u32 v4, v2, v4 -; GFX6-NEXT: v_or_b32_e32 v1, s1, v1 -; GFX6-NEXT: s_lshr_b32 
s1, s4, 16 -; GFX6-NEXT: s_and_b32 s3, s4, 0xff -; GFX6-NEXT: s_lshl_b32 s7, s7, 8 -; GFX6-NEXT: s_or_b32 s3, s3, s7 -; GFX6-NEXT: s_and_b32 s1, s1, 0xff -; GFX6-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_or_b32 s1, s3, s1 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_mul_hi_u32 v4, s1, v2 -; GFX6-NEXT: s_lshr_b32 s2, s5, 8 -; GFX6-NEXT: s_and_b32 s3, s5, 0xff -; GFX6-NEXT: v_mov_b32_e32 v5, s4 -; GFX6-NEXT: v_alignbit_b32 v5, s3, v5, 24 -; GFX6-NEXT: s_and_b32 s2, s2, 0xff -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX6-NEXT: v_mul_lo_u32 v4, v4, 24 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: v_or_b32_e32 v5, s2, v5 -; GFX6-NEXT: v_mul_hi_u32 v2, v5, v2 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s1, v4 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v4, v3 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 +; GFX6-NEXT: s_or_b32 s2, s2, s6 +; GFX6-NEXT: s_lshl_b32 s3, s3, 8 +; GFX6-NEXT: s_and_b32 s6, s8, 0xff +; GFX6-NEXT: s_or_b32 s3, s7, s3 +; GFX6-NEXT: s_lshl_b32 s6, s6, 16 +; GFX6-NEXT: s_or_b32 s3, s3, s6 +; GFX6-NEXT: s_lshr_b32 s6, s4, 16 +; GFX6-NEXT: s_lshr_b32 s7, s4, 24 +; GFX6-NEXT: s_and_b32 s9, s4, 0xff +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x80008 +; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX6-NEXT: s_lshl_b32 s4, s4, 8 +; GFX6-NEXT: s_or_b32 s4, s9, s4 +; GFX6-NEXT: s_and_b32 s6, s6, 0xff +; GFX6-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX6-NEXT: s_lshl_b32 s6, s6, 16 +; GFX6-NEXT: s_or_b32 s4, s4, s6 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s4, v0 +; GFX6-NEXT: s_lshr_b32 s8, s5, 8 +; GFX6-NEXT: s_and_b32 s5, s5, 0xff +; GFX6-NEXT: s_lshl_b32 s5, s5, 8 ; GFX6-NEXT: v_mul_lo_u32 v2, v2, 24 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v4, v3 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 23, v4 -; GFX6-NEXT: v_add_i32_e32 v5, 
vcc, v2, v3 +; GFX6-NEXT: s_and_b32 s6, s8, 0xff +; GFX6-NEXT: s_or_b32 s5, s7, s5 +; GFX6-NEXT: s_lshl_b32 s6, s6, 16 +; GFX6-NEXT: s_or_b32 s5, s5, s6 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s4, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v1 +; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v3 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4 -; GFX6-NEXT: s_lshr_b32 s0, s0, 1 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffffff, v6 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX6-NEXT: v_lshl_b32_e32 v4, s6, v4 -; GFX6-NEXT: v_lshr_b32_e32 v6, s0, v6 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 23, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 -; GFX6-NEXT: v_bfe_u32 v2, v4, 8, 8 +; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 +; GFX6-NEXT: s_lshr_b32 s0, s2, 1 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffffff, v3 +; GFX6-NEXT: v_lshr_b32_e32 v3, s0, v3 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s5, v0 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v0, v1 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v1 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 23, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX6-NEXT: s_lshr_b32 s0, s3, 1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GFX6-NEXT: v_lshl_b32_e32 v0, s1, v0 +; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1 +; GFX6-NEXT: v_bfe_u32 v3, v2, 8, 8 ; GFX6-NEXT: v_or_b32_e32 v0, 
v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_bfe_u32 v2, v4, 16, 8 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX6-NEXT: v_bfe_u32 v2, v2, 16, 8 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v0 @@ -2568,156 +2568,124 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { } define amdgpu_ps i32 @s_fshl_i32(i32 inreg %lhs, i32 inreg %rhs, i32 inreg %amt) { -; GFX6-LABEL: s_fshl_i32: -; GFX6: ; %bb.0: -; GFX6-NEXT: v_mov_b32_e32 v0, s1 -; GFX6-NEXT: s_not_b32 s1, s2 -; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; GFX6-NEXT: s_lshr_b32 s0, s0, 1 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, v1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_fshl_i32: -; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: s_not_b32 s1, s2 -; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; GFX8-NEXT: s_lshr_b32 s0, s0, 1 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, v1 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_fshl_i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_not_b32 s1, s2 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; GFX9-NEXT: s_lshr_b32 s0, s0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v1 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: s_fshl_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, 1 -; GFX10-NEXT: s_lshr_b32 s0, s0, 1 -; GFX10-NEXT: s_not_b32 s1, s2 -; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: ; return to shader part epilog 
+; GCN-LABEL: s_fshl_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_not_b32 s3, s2 +; GCN-NEXT: s_lshr_b32 s1, s1, 1 +; GCN-NEXT: s_lshl_b32 s0, s0, s2 +; GCN-NEXT: s_lshr_b32 s1, s1, s3 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 1 -; GFX11-NEXT: s_lshr_b32 s0, s0, 1 -; GFX11-NEXT: s_not_b32 s1, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_not_b32 s3, s2 +; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: s_lshl_b32 s0, s0, s2 +; GFX11-NEXT: s_lshr_b32 s1, s1, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 %amt) ret i32 %result } define amdgpu_ps i32 @s_fshl_i32_5(i32 inreg %lhs, i32 inreg %rhs) { -; GFX6-LABEL: s_fshl_i32_5: -; GFX6: ; %bb.0: -; GFX6-NEXT: v_mov_b32_e32 v0, s1 -; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, 27 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_fshl_i32_5: -; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 27 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_fshl_i32_5: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, 27 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: s_fshl_i32_5: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, 27 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: s_fshl_i32_5: +; GCN: ; %bb.0: +; GCN-NEXT: s_lshl_b32 s0, s0, 5 +; GCN-NEXT: s_lshr_b32 s1, s1, 27 +; GCN-NEXT: 
s_or_b32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_i32_5: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 27 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-NEXT: s_lshr_b32 s1, s1, 27 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 5) ret i32 %result } define amdgpu_ps i32 @s_fshl_i32_8(i32 inreg %lhs, i32 inreg %rhs) { -; GFX6-LABEL: s_fshl_i32_8: -; GFX6: ; %bb.0: -; GFX6-NEXT: v_mov_b32_e32 v0, s1 -; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, 24 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_fshl_i32_8: -; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 24 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_fshl_i32_8: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, 24 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: s_fshl_i32_8: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, 24 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: s_fshl_i32_8: +; GCN: ; %bb.0: +; GCN-NEXT: s_lshl_b32 s0, s0, 8 +; GCN-NEXT: s_lshr_b32 s1, s1, 24 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_i32_8: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 24 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_lshl_b32 s0, s0, 8 +; GFX11-NEXT: s_lshr_b32 s1, s1, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog 
%result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 8) ret i32 %result } define i32 @v_fshl_i32(i32 %lhs, i32 %rhs, i32 %amt) { -; GCN-LABEL: v_fshl_i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_alignbit_b32 v1, v0, v1, 1 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GCN-NEXT: v_not_b32_e32 v2, v2 -; GCN-NEXT: v_alignbit_b32 v0, v0, v1, v2 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX6-LABEL: v_fshl_i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v3, 31, v2 +; GFX6-NEXT: v_not_b32_e32 v2, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 31, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshl_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v3, 31, v2 +; GFX8-NEXT: v_not_b32_e32 v2, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 31, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, v3, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshl_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v3, 31, v2 +; GFX9-NEXT: v_not_b32_e32 v2, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 31, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, v2, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, v3, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshl_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_not_b32_e32 v3, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX10-NEXT: v_and_b32_e32 v2, 31, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 31, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, v3, v1 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, v2, v1 +; GFX10-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshl_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, v0, v1, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_not_b32_e32 v2, v2 +; GFX11-NEXT: v_not_b32_e32 v3, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 31, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v3, 31, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, v3, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_alignbit_b32 v0, v0, v1, v2 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, v2, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 %amt) ret i32 %result @@ -2758,46 +2726,56 @@ define i32 @v_fshl_i32_8(i32 %lhs, i32 %rhs) { define amdgpu_ps float @v_fshl_i32_ssv(i32 inreg %lhs, i32 inreg %rhs, i32 %amt) { ; GFX6-LABEL: v_fshl_i32_ssv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: v_alignbit_b32 v1, s0, v1, 1 -; GFX6-NEXT: s_lshr_b32 s0, s0, 1 +; GFX6-NEXT: v_and_b32_e32 v1, 31, v0 ; GFX6-NEXT: v_not_b32_e32 v0, v0 -; GFX6-NEXT: v_alignbit_b32 v0, s0, v1, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 31, v0 +; GFX6-NEXT: v_lshl_b32_e32 v1, s0, v1 +; GFX6-NEXT: s_lshr_b32 s0, s1, 1 +; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshl_i32_ssv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_alignbit_b32 v1, s0, v1, 1 -; GFX8-NEXT: s_lshr_b32 s0, s0, 1 +; GFX8-NEXT: v_and_b32_e32 v1, 31, v0 ; GFX8-NEXT: v_not_b32_e32 v0, v0 -; GFX8-NEXT: v_alignbit_b32 v0, s0, v1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 31, v0 +; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 +; GFX8-NEXT: s_lshr_b32 s0, s1, 1 +; GFX8-NEXT: v_lshrrev_b32_e64 v0, v0, s0 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: 
v_fshl_i32_ssv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, 1 -; GFX9-NEXT: s_lshr_b32 s0, s0, 1 +; GFX9-NEXT: v_and_b32_e32 v1, 31, v0 ; GFX9-NEXT: v_not_b32_e32 v0, v0 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 31, v0 +; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: v_lshrrev_b32_e64 v0, v0, s1 +; GFX9-NEXT: v_lshl_or_b32 v0, s0, v1, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshl_i32_ssv: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_alignbit_b32 v1, s0, s1, 1 -; GFX10-NEXT: v_not_b32_e32 v0, v0 -; GFX10-NEXT: s_lshr_b32 s0, s0, 1 -; GFX10-NEXT: v_alignbit_b32 v0, s0, v1, v0 +; GFX10-NEXT: v_not_b32_e32 v1, v0 +; GFX10-NEXT: s_lshr_b32 s1, s1, 1 +; GFX10-NEXT: v_and_b32_e32 v0, 31, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 31, v1 +; GFX10-NEXT: v_lshrrev_b32_e64 v1, v1, s1 +; GFX10-NEXT: v_lshl_or_b32 v0, s0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: v_fshl_i32_ssv: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_alignbit_b32 v1, s0, s1, 1 -; GFX11-NEXT: v_not_b32_e32 v0, v0 -; GFX11-NEXT: s_lshr_b32 s0, s0, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_alignbit_b32 v0, s0, v1, v0 +; GFX11-NEXT: v_not_b32_e32 v1, v0 +; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: v_and_b32_e32 v0, 31, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v1, 31, v1 +; GFX11-NEXT: v_lshrrev_b32_e64 v1, v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, s0, v0, v1 ; GFX11-NEXT: ; return to shader part epilog %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 %amt) %cast.result = bitcast i32 %result to float @@ -2807,46 +2785,48 @@ define amdgpu_ps float @v_fshl_i32_ssv(i32 inreg %lhs, i32 inreg %rhs, i32 %amt) define amdgpu_ps float @v_fshl_i32_svs(i32 inreg %lhs, i32 %rhs, i32 inreg %amt) { ; GFX6-LABEL: v_fshl_i32_svs: ; GFX6: 
; %bb.0: -; GFX6-NEXT: s_not_b32 s1, s1 -; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; GFX6-NEXT: s_lshr_b32 s0, s0, 1 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, v1 +; GFX6-NEXT: s_andn2_b32 s2, 31, s1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: s_lshl_b32 s0, s0, s1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s2, v0 +; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshl_i32_svs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_not_b32 s1, s1 -; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; GFX8-NEXT: s_lshr_b32 s0, s0, 1 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, v1 +; GFX8-NEXT: s_andn2_b32 s2, 31, s1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s2, v0 +; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshl_i32_svs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_not_b32 s1, s1 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; GFX9-NEXT: s_lshr_b32 s0, s0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v1 +; GFX9-NEXT: s_andn2_b32 s2, 31, s1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: s_lshl_b32 s0, s0, s1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, s2, v0 +; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshl_i32_svs: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; GFX10-NEXT: s_lshr_b32 s0, s0, 1 -; GFX10-NEXT: s_not_b32 s1, s1 -; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX10-NEXT: s_andn2_b32 s2, 31, s1 +; GFX10-NEXT: s_lshl_b32 s0, s0, s1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, s2, v0 +; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: v_fshl_i32_svs: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; GFX11-NEXT: s_lshr_b32 s0, s0, 1 -; GFX11-NEXT: 
s_not_b32 s1, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, s1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_and_not1_b32 s2, 31, s1 +; GFX11-NEXT: s_lshl_b32 s0, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s2, v0 +; GFX11-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX11-NEXT: ; return to shader part epilog %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 %amt) %cast.result = bitcast i32 %result to float @@ -2854,51 +2834,25 @@ define amdgpu_ps float @v_fshl_i32_svs(i32 inreg %lhs, i32 %rhs, i32 inreg %amt) } define amdgpu_ps float @v_fshl_i32_vss(i32 inreg %lhs, i32 inreg %rhs, i32 inreg %amt) { -; GFX6-LABEL: v_fshl_i32_vss: -; GFX6: ; %bb.0: -; GFX6-NEXT: v_mov_b32_e32 v0, s1 -; GFX6-NEXT: s_not_b32 s1, s2 -; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; GFX6-NEXT: s_lshr_b32 s0, s0, 1 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, v1 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: v_fshl_i32_vss: -; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: s_not_b32 s1, s2 -; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; GFX8-NEXT: s_lshr_b32 s0, s0, 1 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, v1 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: v_fshl_i32_vss: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_not_b32 s1, s2 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; GFX9-NEXT: s_lshr_b32 s0, s0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v1 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: v_fshl_i32_vss: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, 1 -; GFX10-NEXT: s_lshr_b32 s0, s0, 1 -; GFX10-NEXT: s_not_b32 s1, s2 -; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1 -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: 
v_fshl_i32_vss: +; GCN: ; %bb.0: +; GCN-NEXT: s_not_b32 s3, s2 +; GCN-NEXT: s_lshr_b32 s1, s1, 1 +; GCN-NEXT: s_lshl_b32 s0, s0, s2 +; GCN-NEXT: s_lshr_b32 s1, s1, s3 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: v_fshl_i32_vss: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 1 -; GFX11-NEXT: s_lshr_b32 s0, s0, 1 -; GFX11-NEXT: s_not_b32 s1, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, s1 +; GFX11-NEXT: s_not_b32 s3, s2 +; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: s_lshl_b32 s0, s0, s2 +; GFX11-NEXT: s_lshr_b32 s1, s1, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: ; return to shader part epilog %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 %amt) %cast.result = bitcast i32 %result to float @@ -2909,67 +2863,92 @@ define <2 x i32> @v_fshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt) { ; GFX6-LABEL: v_fshl_v2i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_alignbit_b32 v2, v0, v2, 1 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_and_b32_e32 v6, 31, v4 ; GFX6-NEXT: v_not_b32_e32 v4, v4 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v2, v4 -; GFX6-NEXT: v_alignbit_b32 v2, v1, v3, 1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX6-NEXT: v_not_b32_e32 v3, v5 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v2, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 31, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v6, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 31, v5 +; GFX6-NEXT: v_not_b32_e32 v4, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 31, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v3 +; GFX6-NEXT: 
v_lshrrev_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshl_v2i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_alignbit_b32 v2, v0, v2, 1 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_and_b32_e32 v6, 31, v4 ; GFX8-NEXT: v_not_b32_e32 v4, v4 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, v4 -; GFX8-NEXT: v_alignbit_b32 v2, v1, v3, 1 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_not_b32_e32 v3, v5 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 31, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, v6, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 31, v5 +; GFX8-NEXT: v_not_b32_e32 v4, v5 +; GFX8-NEXT: v_and_b32_e32 v4, 31, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshl_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_alignbit_b32 v2, v0, v2, 1 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_and_b32_e32 v6, 31, v4 ; GFX9-NEXT: v_not_b32_e32 v4, v4 -; GFX9-NEXT: v_alignbit_b32 v0, v0, v2, v4 -; GFX9-NEXT: v_alignbit_b32 v2, v1, v3, 1 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_not_b32_e32 v3, v5 -; GFX9-NEXT: v_alignbit_b32 v1, v1, v2, v3 +; GFX9-NEXT: v_and_b32_e32 v4, 31, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, v4, v2 +; GFX9-NEXT: v_not_b32_e32 v4, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 31, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, v6, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 31, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, v4, v3 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, v2, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX10-LABEL: v_fshl_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v2, v0, v2, 1 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_not_b32_e32 v4, v4 -; GFX10-NEXT: v_alignbit_b32 v3, v1, v3, 1 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_not_b32_e32 v5, v5 -; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v4 -; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5 +; GFX10-NEXT: v_not_b32_e32 v6, v4 +; GFX10-NEXT: v_not_b32_e32 v7, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX10-NEXT: v_and_b32_e32 v4, 31, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 31, v6 +; GFX10-NEXT: v_and_b32_e32 v7, 31, v7 +; GFX10-NEXT: v_and_b32_e32 v5, 31, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, v6, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v7, v3 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, v4, v2 +; GFX10-NEXT: v_lshl_or_b32 v1, v1, v5, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshl_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v2, v0, v2, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_not_b32_e32 v4, v4 -; GFX11-NEXT: v_alignbit_b32 v3, v1, v3, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX11-NEXT: v_not_b32_e32 v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_alignbit_b32 v0, v0, v2, v4 -; GFX11-NEXT: v_alignbit_b32 v1, v1, v3, v5 +; GFX11-NEXT: v_not_b32_e32 v6, v4 +; GFX11-NEXT: v_not_b32_e32 v7, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 31, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 31, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 31, v7 +; GFX11-NEXT: v_and_b32_e32 v5, 31, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, v6, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v7, v3 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v0, v0, v4, v2 +; GFX11-NEXT: v_lshl_or_b32 v1, v1, v5, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt) ret <2 x i32> %result @@ -2979,87 +2958,123 @@ define <3 x i32> @v_fshl_v3i32(<3 x i32> %lhs, <3 x i32> %rhs, <3 x i32> %amt) { ; GFX6-LABEL: v_fshl_v3i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_alignbit_b32 v3, v0, v3, 1 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_and_b32_e32 v9, 31, v6 ; GFX6-NEXT: v_not_b32_e32 v6, v6 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v3, v6 -; GFX6-NEXT: v_alignbit_b32 v3, v1, v4, 1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX6-NEXT: v_not_b32_e32 v4, v7 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v3, v4 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v5, 1 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX6-NEXT: v_and_b32_e32 v6, 31, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v9, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v6, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 31, v7 +; GFX6-NEXT: v_not_b32_e32 v6, v7 +; GFX6-NEXT: v_and_b32_e32 v6, 31, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, v3, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v6, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 31, v8 ; GFX6-NEXT: v_not_b32_e32 v4, v8 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v3, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 31, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshl_v3i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_alignbit_b32 v3, v0, v3, 1 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX8-NEXT: 
v_and_b32_e32 v9, 31, v6 ; GFX8-NEXT: v_not_b32_e32 v6, v6 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, v6 -; GFX8-NEXT: v_alignbit_b32 v3, v1, v4, 1 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_not_b32_e32 v4, v7 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v3, v4 -; GFX8-NEXT: v_alignbit_b32 v3, v2, v5, 1 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX8-NEXT: v_and_b32_e32 v6, 31, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, v9, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v6, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 31, v7 +; GFX8-NEXT: v_not_b32_e32 v6, v7 +; GFX8-NEXT: v_and_b32_e32 v6, 31, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 1, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v6, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 31, v8 ; GFX8-NEXT: v_not_b32_e32 v4, v8 -; GFX8-NEXT: v_alignbit_b32 v2, v2, v3, v4 +; GFX8-NEXT: v_and_b32_e32 v4, 31, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, v3, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 1, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, v4, v3 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshl_v3i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_alignbit_b32 v3, v0, v3, 1 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_and_b32_e32 v9, 31, v6 ; GFX9-NEXT: v_not_b32_e32 v6, v6 -; GFX9-NEXT: v_alignbit_b32 v0, v0, v3, v6 -; GFX9-NEXT: v_alignbit_b32 v3, v1, v4, 1 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_not_b32_e32 v4, v7 -; GFX9-NEXT: v_alignbit_b32 v1, v1, v3, v4 -; GFX9-NEXT: v_alignbit_b32 v3, v2, v5, 1 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX9-NEXT: v_and_b32_e32 v6, 31, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, v6, v3 +; GFX9-NEXT: v_not_b32_e32 v6, v7 +; GFX9-NEXT: v_and_b32_e32 v6, 31, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX9-NEXT: 
v_lshl_or_b32 v0, v0, v9, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 31, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, v6, v4 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, v3, v4 ; GFX9-NEXT: v_not_b32_e32 v4, v8 -; GFX9-NEXT: v_alignbit_b32 v2, v2, v3, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 31, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 1, v5 +; GFX9-NEXT: v_and_b32_e32 v3, 31, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, v4, v5 +; GFX9-NEXT: v_lshl_or_b32 v2, v2, v3, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fshl_v3i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v3, v0, v3, 1 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_not_b32_e32 v6, v6 -; GFX10-NEXT: v_alignbit_b32 v4, v1, v4, 1 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_not_b32_e32 v7, v7 -; GFX10-NEXT: v_alignbit_b32 v5, v2, v5, 1 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v2 -; GFX10-NEXT: v_not_b32_e32 v8, v8 -; GFX10-NEXT: v_alignbit_b32 v0, v0, v3, v6 -; GFX10-NEXT: v_alignbit_b32 v1, v1, v4, v7 -; GFX10-NEXT: v_alignbit_b32 v2, v2, v5, v8 +; GFX10-NEXT: v_not_b32_e32 v9, v6 +; GFX10-NEXT: v_not_b32_e32 v10, v7 +; GFX10-NEXT: v_not_b32_e32 v11, v8 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX10-NEXT: v_and_b32_e32 v9, 31, v9 +; GFX10-NEXT: v_and_b32_e32 v10, 31, v10 +; GFX10-NEXT: v_and_b32_e32 v11, 31, v11 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 1, v5 +; GFX10-NEXT: v_and_b32_e32 v6, 31, v6 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v9, v3 +; GFX10-NEXT: v_and_b32_e32 v7, 31, v7 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, v10, v4 +; GFX10-NEXT: v_and_b32_e32 v8, 31, v8 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, v11, v5 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, v6, v3 +; GFX10-NEXT: v_lshl_or_b32 v1, v1, v7, v4 +; GFX10-NEXT: v_lshl_or_b32 v2, v2, v8, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshl_v3i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v3, 
v0, v3, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_not_b32_e32 v6, v6 -; GFX11-NEXT: v_alignbit_b32 v4, v1, v4, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX11-NEXT: v_not_b32_e32 v7, v7 -; GFX11-NEXT: v_alignbit_b32 v5, v2, v5, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 1, v2 -; GFX11-NEXT: v_not_b32_e32 v8, v8 -; GFX11-NEXT: v_alignbit_b32 v0, v0, v3, v6 -; GFX11-NEXT: v_alignbit_b32 v1, v1, v4, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_alignbit_b32 v2, v2, v5, v8 +; GFX11-NEXT: v_not_b32_e32 v9, v6 +; GFX11-NEXT: v_not_b32_e32 v10, v7 +; GFX11-NEXT: v_not_b32_e32 v11, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX11-NEXT: v_and_b32_e32 v9, 31, v9 +; GFX11-NEXT: v_and_b32_e32 v10, 31, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 31, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 1, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 31, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v9, v3 +; GFX11-NEXT: v_and_b32_e32 v7, 31, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, v10, v4 +; GFX11-NEXT: v_and_b32_e32 v8, 31, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, v11, v5 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshl_or_b32 v1, v1, v7, v4 +; GFX11-NEXT: v_lshl_or_b32 v2, v2, v8, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <3 x i32> @llvm.fshl.v3i32(<3 x i32> %lhs, <3 x i32> %rhs, <3 x i32> %amt) ret <3 x i32> %result @@ -3069,107 +3084,155 @@ define <4 x i32> @v_fshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> %amt) { ; GFX6-LABEL: v_fshl_v4i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_alignbit_b32 v4, v0, v4, 1 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_and_b32_e32 v12, 31, v8 ; GFX6-NEXT: v_not_b32_e32 v8, v8 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v4, v8 -; GFX6-NEXT: v_alignbit_b32 v4, v1, v5, 1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; 
GFX6-NEXT: v_not_b32_e32 v5, v9 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v4, v5 -; GFX6-NEXT: v_alignbit_b32 v4, v2, v6, 1 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX6-NEXT: v_and_b32_e32 v8, 31, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v12, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 31, v9 +; GFX6-NEXT: v_not_b32_e32 v8, v9 +; GFX6-NEXT: v_and_b32_e32 v8, 31, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v4 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 31, v10 ; GFX6-NEXT: v_not_b32_e32 v5, v10 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v4, v5 -; GFX6-NEXT: v_alignbit_b32 v4, v3, v7, 1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX6-NEXT: v_and_b32_e32 v5, 31, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 31, v11 ; GFX6-NEXT: v_not_b32_e32 v5, v11 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v4, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 31, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshl_v4i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_alignbit_b32 v4, v0, v4, 1 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_and_b32_e32 v12, 31, v8 ; GFX8-NEXT: v_not_b32_e32 v8, v8 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v4, v8 -; GFX8-NEXT: v_alignbit_b32 v4, v1, v5, 1 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_not_b32_e32 v5, v9 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v2, v6, 1 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX8-NEXT: v_and_b32_e32 v8, 
31, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, v12, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, v8, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_and_b32_e32 v4, 31, v9 +; GFX8-NEXT: v_not_b32_e32 v8, v9 +; GFX8-NEXT: v_and_b32_e32 v8, 31, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 1, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, v8, v4 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_and_b32_e32 v4, 31, v10 ; GFX8-NEXT: v_not_b32_e32 v5, v10 -; GFX8-NEXT: v_alignbit_b32 v2, v2, v4, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v3, v7, 1 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 31, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 1, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, v5, v4 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_and_b32_e32 v4, 31, v11 ; GFX8-NEXT: v_not_b32_e32 v5, v11 -; GFX8-NEXT: v_alignbit_b32 v3, v3, v4, v5 +; GFX8-NEXT: v_and_b32_e32 v5, 31, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 1, v7 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, v5, v4 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshl_v4i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_alignbit_b32 v4, v0, v4, 1 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_and_b32_e32 v12, 31, v8 ; GFX9-NEXT: v_not_b32_e32 v8, v8 -; GFX9-NEXT: v_alignbit_b32 v0, v0, v4, v8 -; GFX9-NEXT: v_alignbit_b32 v4, v1, v5, 1 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_not_b32_e32 v5, v9 -; GFX9-NEXT: v_alignbit_b32 v1, v1, v4, v5 -; GFX9-NEXT: v_alignbit_b32 v4, v2, v6, 1 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX9-NEXT: v_and_b32_e32 v8, 31, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, v8, v4 +; GFX9-NEXT: v_not_b32_e32 v8, v9 +; GFX9-NEXT: v_and_b32_e32 v8, 31, v8 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v5, 1, v5 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, v12, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 31, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, v8, v5 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, v4, v5 ; GFX9-NEXT: v_not_b32_e32 v5, v10 -; GFX9-NEXT: v_alignbit_b32 v2, v2, v4, v5 -; GFX9-NEXT: v_alignbit_b32 v4, v3, v7, 1 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX9-NEXT: v_and_b32_e32 v5, 31, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 1, v6 +; GFX9-NEXT: v_and_b32_e32 v4, 31, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, v5, v6 +; GFX9-NEXT: v_lshl_or_b32 v2, v2, v4, v5 ; GFX9-NEXT: v_not_b32_e32 v5, v11 -; GFX9-NEXT: v_alignbit_b32 v3, v3, v4, v5 +; GFX9-NEXT: v_and_b32_e32 v5, 31, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 1, v7 +; GFX9-NEXT: v_and_b32_e32 v4, 31, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, v5, v6 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, v4, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fshl_v4i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v4, v0, v4, 1 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_not_b32_e32 v8, v8 -; GFX10-NEXT: v_alignbit_b32 v5, v1, v5, 1 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_not_b32_e32 v9, v9 -; GFX10-NEXT: v_alignbit_b32 v6, v2, v6, 1 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v2 -; GFX10-NEXT: v_not_b32_e32 v10, v10 -; GFX10-NEXT: v_alignbit_b32 v7, v3, v7, 1 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX10-NEXT: v_not_b32_e32 v11, v11 -; GFX10-NEXT: v_alignbit_b32 v0, v0, v4, v8 -; GFX10-NEXT: v_alignbit_b32 v1, v1, v5, v9 -; GFX10-NEXT: v_alignbit_b32 v2, v2, v6, v10 -; GFX10-NEXT: v_alignbit_b32 v3, v3, v7, v11 +; GFX10-NEXT: v_not_b32_e32 v12, v8 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX10-NEXT: v_not_b32_e32 v13, v9 +; GFX10-NEXT: v_not_b32_e32 v14, v11 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 1, v5 +; GFX10-NEXT: v_and_b32_e32 v12, 31, v12 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v6 +; GFX10-NEXT: v_and_b32_e32 v13, 31, v13 +; 
GFX10-NEXT: v_and_b32_e32 v14, 31, v14 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 1, v7 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, v12, v4 +; GFX10-NEXT: v_not_b32_e32 v12, v10 +; GFX10-NEXT: v_and_b32_e32 v8, 31, v8 +; GFX10-NEXT: v_and_b32_e32 v9, 31, v9 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, v13, v5 +; GFX10-NEXT: v_and_b32_e32 v10, 31, v10 +; GFX10-NEXT: v_and_b32_e32 v12, 31, v12 +; GFX10-NEXT: v_and_b32_e32 v11, 31, v11 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, v14, v7 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, v8, v4 +; GFX10-NEXT: v_lshl_or_b32 v1, v1, v9, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, v12, v6 +; GFX10-NEXT: v_lshl_or_b32 v3, v3, v11, v7 +; GFX10-NEXT: v_lshl_or_b32 v2, v2, v10, v6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshl_v4i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v4, v0, v4, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_not_b32_e32 v8, v8 -; GFX11-NEXT: v_alignbit_b32 v5, v1, v5, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX11-NEXT: v_not_b32_e32 v9, v9 -; GFX11-NEXT: v_alignbit_b32 v6, v2, v6, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 1, v2 -; GFX11-NEXT: v_not_b32_e32 v10, v10 -; GFX11-NEXT: v_alignbit_b32 v7, v3, v7, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX11-NEXT: v_not_b32_e32 v11, v11 -; GFX11-NEXT: v_alignbit_b32 v0, v0, v4, v8 -; GFX11-NEXT: v_alignbit_b32 v1, v1, v5, v9 -; GFX11-NEXT: v_alignbit_b32 v2, v2, v6, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_alignbit_b32 v3, v3, v7, v11 +; GFX11-NEXT: v_not_b32_e32 v12, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX11-NEXT: v_not_b32_e32 v13, v9 +; GFX11-NEXT: v_not_b32_e32 v14, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 1, v5 +; GFX11-NEXT: v_and_b32_e32 v12, 31, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 1, v6 +; GFX11-NEXT: v_and_b32_e32 v13, 31, v13 +; GFX11-NEXT: v_and_b32_e32 v14, 31, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 1, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, v12, 
v4 +; GFX11-NEXT: v_not_b32_e32 v12, v10 +; GFX11-NEXT: v_and_b32_e32 v8, 31, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 31, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, v13, v5 +; GFX11-NEXT: v_and_b32_e32 v10, 31, v10 +; GFX11-NEXT: v_and_b32_e32 v12, 31, v12 +; GFX11-NEXT: v_and_b32_e32 v11, 31, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, v14, v7 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, v8, v4 +; GFX11-NEXT: v_lshl_or_b32 v1, v1, v9, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, v12, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v3, v3, v11, v7 +; GFX11-NEXT: v_lshl_or_b32 v2, v2, v10, v6 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> %amt) ret <4 x i32> %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll index 238cc06fc7f7c..e4a07748cf929 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -1783,102 +1783,102 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) { define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 inreg %amt.arg) { ; GFX6-LABEL: s_fshr_v2i24: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX6-NEXT: s_bfe_u32 s9, s0, 0x80008 -; GFX6-NEXT: v_not_b32_e32 v3, 23 -; GFX6-NEXT: s_lshr_b32 s7, s1, 8 -; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX6-NEXT: s_and_b32 s8, s0, 0xff -; GFX6-NEXT: s_lshl_b32 s9, s9, 8 -; GFX6-NEXT: s_and_b32 s1, s1, 0xff -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: s_bfe_u32 s10, s2, 0x80008 -; GFX6-NEXT: v_mul_lo_u32 v4, v2, v3 -; GFX6-NEXT: s_or_b32 s8, s8, s9 -; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 24 -; GFX6-NEXT: s_lshr_b32 s1, s2, 16 -; GFX6-NEXT: s_and_b32 s9, s2, 0xff -; GFX6-NEXT: s_lshl_b32 s10, s10, 8 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 +; 
GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: s_lshr_b32 s6, s0, 16 -; GFX6-NEXT: s_and_b32 s0, s7, 0xff -; GFX6-NEXT: s_lshr_b32 s7, s3, 8 -; GFX6-NEXT: s_or_b32 s9, s9, s10 -; GFX6-NEXT: s_and_b32 s1, s1, 0xff +; GFX6-NEXT: s_lshr_b32 s7, s0, 24 +; GFX6-NEXT: s_and_b32 s9, s0, 0xff +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x80008 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX6-NEXT: s_lshl_b32 s0, s0, 8 +; GFX6-NEXT: s_or_b32 s0, s9, s0 +; GFX6-NEXT: s_lshr_b32 s9, s2, 16 +; GFX6-NEXT: s_lshr_b32 s10, s2, 24 +; GFX6-NEXT: s_and_b32 s12, s2, 0xff +; GFX6-NEXT: s_bfe_u32 s2, s2, 0x80008 +; GFX6-NEXT: s_lshl_b32 s2, s2, 8 +; GFX6-NEXT: v_not_b32_e32 v1, 23 +; GFX6-NEXT: s_or_b32 s2, s12, s2 +; GFX6-NEXT: s_and_b32 s9, s9, 0xff +; GFX6-NEXT: v_mul_lo_u32 v2, v0, v1 +; GFX6-NEXT: s_lshr_b32 s11, s3, 8 +; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX6-NEXT: s_lshl_b32 s9, s9, 16 ; GFX6-NEXT: s_and_b32 s3, s3, 0xff -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_and_b32 s9, 0xffff, s9 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 24 -; GFX6-NEXT: s_and_b32 s2, s7, 0xff -; GFX6-NEXT: s_or_b32 s1, s9, s1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_bfe_u32 s9, s4, 0x80008 -; GFX6-NEXT: v_mul_hi_u32 v4, v2, v4 -; GFX6-NEXT: v_or_b32_e32 v1, s2, v1 -; GFX6-NEXT: s_lshr_b32 s2, s4, 16 -; GFX6-NEXT: s_and_b32 s7, s4, 0xff -; GFX6-NEXT: s_lshl_b32 s9, s9, 8 -; GFX6-NEXT: s_or_b32 s7, s7, s9 -; GFX6-NEXT: s_and_b32 s2, s2, 0xff -; GFX6-NEXT: s_and_b32 s7, 0xffff, s7 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_or_b32 s2, s7, s2 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_mul_hi_u32 v4, s2, v2 -; GFX6-NEXT: s_lshr_b32 s3, s5, 8 +; GFX6-NEXT: s_or_b32 s2, s2, s9 +; GFX6-NEXT: s_lshl_b32 s3, s3, 8 +; GFX6-NEXT: s_and_b32 s9, s11, 0xff +; GFX6-NEXT: s_or_b32 s3, s10, s3 +; GFX6-NEXT: s_lshl_b32 s9, s9, 16 +; GFX6-NEXT: s_or_b32 s3, s3, s9 
+; GFX6-NEXT: s_lshr_b32 s9, s4, 16 +; GFX6-NEXT: s_lshr_b32 s10, s4, 24 +; GFX6-NEXT: s_and_b32 s12, s4, 0xff +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x80008 +; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX6-NEXT: s_lshl_b32 s4, s4, 8 +; GFX6-NEXT: s_or_b32 s4, s12, s4 +; GFX6-NEXT: s_and_b32 s9, s9, 0xff +; GFX6-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX6-NEXT: s_lshl_b32 s9, s9, 16 +; GFX6-NEXT: s_or_b32 s4, s4, s9 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s4, v0 +; GFX6-NEXT: s_lshr_b32 s11, s5, 8 ; GFX6-NEXT: s_and_b32 s5, s5, 0xff -; GFX6-NEXT: v_mov_b32_e32 v5, s4 -; GFX6-NEXT: v_alignbit_b32 v5, s5, v5, 24 -; GFX6-NEXT: s_and_b32 s3, s3, 0xff -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX6-NEXT: v_mul_lo_u32 v4, v4, 24 -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: v_or_b32_e32 v5, s3, v5 -; GFX6-NEXT: v_mul_hi_u32 v2, v5, v2 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s2, v4 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v4, v3 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 +; GFX6-NEXT: s_lshl_b32 s5, s5, 8 ; GFX6-NEXT: v_mul_lo_u32 v2, v2, 24 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v4, v3 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 23, v4 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v2, v3 +; GFX6-NEXT: s_and_b32 s9, s11, 0xff +; GFX6-NEXT: s_or_b32 s5, s10, s5 +; GFX6-NEXT: s_lshl_b32 s9, s9, 16 +; GFX6-NEXT: s_or_b32 s5, s5, s9 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s4, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v1 +; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX6-NEXT: s_and_b32 s6, s6, 0xff -; GFX6-NEXT: s_and_b32 s8, 0xffff, s8 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v3 -; GFX6-NEXT: s_lshl_b32 s2, s6, 17 -; GFX6-NEXT: s_lshl_b32 s3, s8, 1 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX6-NEXT: 
v_add_i32_e32 v3, vcc, v2, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffffff, v6 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, 24 +; GFX6-NEXT: s_and_b32 s6, s6, 0xff +; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX6-NEXT: v_lshl_b32_e32 v6, s2, v6 -; GFX6-NEXT: v_lshr_b32_e32 v4, s1, v4 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 23, v2 -; GFX6-NEXT: s_lshl_b32 s0, s0, 17 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_or_b32_e32 v4, v6, v4 -; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX6-NEXT: s_lshl_b32 s4, s6, 17 +; GFX6-NEXT: s_lshl_b32 s0, s0, 1 +; GFX6-NEXT: s_or_b32 s0, s4, s0 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, v3, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 -; GFX6-NEXT: v_bfe_u32 v2, v4, 8, 8 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_bfe_u32 v2, v4, 16, 8 +; GFX6-NEXT: v_lshl_b32_e32 v3, s0, v3 +; GFX6-NEXT: v_lshr_b32_e32 v2, s2, v2 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s5, v0 +; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v0, v1 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v1 +; GFX6-NEXT: s_lshr_b32 s8, s1, 8 +; GFX6-NEXT: s_and_b32 s1, s1, 0xff +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 +; GFX6-NEXT: s_and_b32 s8, s8, 0xff +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX6-NEXT: s_lshl_b32 s1, s1, 9 +; GFX6-NEXT: s_lshl_b32 s2, s7, 1 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 23, v0 +; GFX6-NEXT: s_lshl_b32 s0, s8, 17 +; GFX6-NEXT: s_or_b32 s1, s1, s2 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; 
GFX6-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX6-NEXT: v_lshl_b32_e32 v1, s0, v1 +; GFX6-NEXT: v_lshr_b32_e32 v0, s3, v0 +; GFX6-NEXT: v_bfe_u32 v3, v2, 8, 8 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX6-NEXT: v_bfe_u32 v2, v2, 16, 8 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v0 @@ -2592,117 +2592,86 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { define amdgpu_ps i32 @s_fshr_i32(i32 inreg %lhs, i32 inreg %rhs, i32 inreg %amt) { ; GFX6-LABEL: s_fshr_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_mov_b32_e32 v0, s1 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, v1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_not_b32 s3, s2 +; GFX6-NEXT: s_lshl_b32 s0, s0, 1 +; GFX6-NEXT: s_lshl_b32 s0, s0, s3 +; GFX6-NEXT: s_lshr_b32 s1, s1, s2 +; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshr_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, v1 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: s_not_b32 s3, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_lshl_b32 s0, s0, s3 +; GFX8-NEXT: s_lshr_b32 s1, s1, s2 +; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v1 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_not_b32 s3, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_lshl_b32 s0, s0, s3 +; GFX9-NEXT: s_lshr_b32 s1, s1, s2 +; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; 
GFX10-NEXT: v_alignbit_b32 v0, s0, s1, v0 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: s_not_b32 s3, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_lshr_b32 s1, s1, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, s3 +; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshr_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, v0 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_not_b32 s3, s2 +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: s_lshr_b32 s1, s1, s2 +; GFX11-NEXT: s_lshl_b32 s0, s0, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 %amt) ret i32 %result } define amdgpu_ps i32 @s_fshr_i32_5(i32 inreg %lhs, i32 inreg %rhs) { -; GFX6-LABEL: s_fshr_i32_5: -; GFX6: ; %bb.0: -; GFX6-NEXT: v_mov_b32_e32 v0, s1 -; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, 5 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_fshr_i32_5: -; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 5 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_fshr_i32_5: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, 5 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: s_fshr_i32_5: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, 5 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: s_fshr_i32_5: +; GCN: ; %bb.0: +; GCN-NEXT: s_lshl_b32 s0, s0, 27 +; GCN-NEXT: s_lshr_b32 s1, s1, 5 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog ; 
; GFX11-LABEL: s_fshr_i32_5: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_lshl_b32 s0, s0, 27 +; GFX11-NEXT: s_lshr_b32 s1, s1, 5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 5) ret i32 %result } define amdgpu_ps i32 @s_fshr_i32_8(i32 inreg %lhs, i32 inreg %rhs) { -; GFX6-LABEL: s_fshr_i32_8: -; GFX6: ; %bb.0: -; GFX6-NEXT: v_mov_b32_e32 v0, s1 -; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, 8 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_fshr_i32_8: -; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 8 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_fshr_i32_8: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, 8 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: s_fshr_i32_8: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, 8 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: s_fshr_i32_8: +; GCN: ; %bb.0: +; GCN-NEXT: s_lshl_b32 s0, s0, 24 +; GCN-NEXT: s_lshr_b32 s1, s1, 8 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshr_i32_8: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_lshl_b32 s0, s0, 24 +; GFX11-NEXT: s_lshr_b32 s1, s1, 8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 8) ret i32 
%result @@ -2792,20 +2761,20 @@ define amdgpu_ps float @v_fshr_i32_ssv(i32 inreg %lhs, i32 inreg %rhs, i32 %amt) define amdgpu_ps float @v_fshr_i32_svs(i32 inreg %lhs, i32 %rhs, i32 inreg %amt) { ; GFX6-LABEL: v_fshr_i32_svs: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshr_i32_svs: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshr_i32_svs: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_alignbit_b32 v0, v1, v0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshr_i32_svs: @@ -2825,36 +2794,53 @@ define amdgpu_ps float @v_fshr_i32_svs(i32 inreg %lhs, i32 %rhs, i32 inreg %amt) define amdgpu_ps float @v_fshr_i32_vss(i32 inreg %lhs, i32 inreg %rhs, i32 inreg %amt) { ; GFX6-LABEL: v_fshr_i32_vss: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_mov_b32_e32 v0, s1 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, v1 +; GFX6-NEXT: s_not_b32 s3, s2 +; GFX6-NEXT: s_lshl_b32 s0, s0, 1 +; GFX6-NEXT: s_lshl_b32 s0, s0, s3 +; GFX6-NEXT: s_lshr_b32 s1, s1, s2 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshr_i32_vss: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, v1 +; GFX8-NEXT: s_not_b32 s3, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_lshl_b32 s0, s0, s3 +; GFX8-NEXT: s_lshr_b32 s1, s1, s2 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: ; return to shader part epilog 
; ; GFX9-LABEL: v_fshr_i32_vss: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v1 +; GFX9-NEXT: s_not_b32 s3, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_lshl_b32 s0, s0, s3 +; GFX9-NEXT: s_lshr_b32 s1, s1, s2 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshr_i32_vss: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, v0 +; GFX10-NEXT: s_not_b32 s3, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_lshr_b32 s1, s1, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, s3 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: v_fshr_i32_vss: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, v0 +; GFX11-NEXT: s_not_b32 s3, s2 +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: s_lshr_b32 s1, s1, s2 +; GFX11-NEXT: s_lshl_b32 s0, s0, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: ; return to shader part epilog %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 %amt) %cast.result = bitcast i32 %result to float diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir deleted file mode 100644 index 0a4cb3ccf2957..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir +++ /dev/null @@ -1,41 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=fiji 
-run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11 %s - ---- - -name: fshr_s32 -legalized: true -regBankSelected: true - -body: | - bb.0: - liveins: $vgpr0, $vgpr1, $vgpr2 - - ; GCN-LABEL: name: fshr_s32 - ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GCN-NEXT: [[V_ALIGNBIT_B32_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_e64 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec - ; GCN-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_e64_]] - ; - ; GFX11-LABEL: name: fshr_s32 - ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: [[V_ALIGNBIT_B32_fake16_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_fake16_e64_]] - %0:vgpr(s32) = COPY $vgpr0 - %1:vgpr(s32) = COPY $vgpr1 - %2:vgpr(s32) = COPY $vgpr2 - %3:vgpr(s32) = G_FSHR %0, %1, %2 - S_ENDPGM 0, implicit %3 - -... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshl.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshl.mir index 240036207bd0d..d3d5b243ca766 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshl.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshl.mir @@ -15,13 +15,17 @@ body: | ; SI-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; SI-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; SI-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; SI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; SI-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[COPY]], [[COPY1]], [[C]](s32) - ; SI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; SI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 + ; SI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]] ; SI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 ; SI-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY2]], [[C1]] - ; SI-NEXT: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[LSHR]], [[FSHR]], [[XOR]](s32) - ; SI-NEXT: $vgpr0 = COPY [[FSHR1]](s32) + ; SI-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[XOR]], [[C]] + ; SI-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; SI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[AND]](s32) + ; SI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C2]](s32) + ; SI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[LSHR]], [[AND1]](s32) + ; SI-NEXT: [[OR:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL]], [[LSHR1]] + ; SI-NEXT: $vgpr0 = COPY [[OR]](s32) ; ; VI-LABEL: name: test_fshl_s32_s32 ; VI: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -29,13 +33,17 @@ body: | ; VI-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; VI-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; VI-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; VI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; VI-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[COPY]], [[COPY1]], [[C]](s32) - ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; VI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 + ; VI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], 
[[C]] ; VI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 ; VI-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY2]], [[C1]] - ; VI-NEXT: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[LSHR]], [[FSHR]], [[XOR]](s32) - ; VI-NEXT: $vgpr0 = COPY [[FSHR1]](s32) + ; VI-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[XOR]], [[C]] + ; VI-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; VI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[AND]](s32) + ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C2]](s32) + ; VI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[LSHR]], [[AND1]](s32) + ; VI-NEXT: [[OR:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL]], [[LSHR1]] + ; VI-NEXT: $vgpr0 = COPY [[OR]](s32) ; ; GFX9-LABEL: name: test_fshl_s32_s32 ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -43,13 +51,17 @@ body: | ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[COPY]], [[COPY1]], [[C]](s32) - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 + ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]] ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 ; GFX9-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY2]], [[C1]] - ; GFX9-NEXT: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[LSHR]], [[FSHR]], [[XOR]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[FSHR1]](s32) + ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[XOR]], [[C]] + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[AND]](s32) + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C2]](s32) + ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[LSHR]], [[AND1]](s32) + ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL]], [[LSHR1]] + ; GFX9-NEXT: $vgpr0 = COPY [[OR]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 
%2:_(s32) = COPY $vgpr2 @@ -72,17 +84,24 @@ body: | ; SI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) ; SI-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) ; SI-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<2 x s32>) - ; SI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; SI-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[UV]], [[UV2]], [[C]](s32) - ; SI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32) + ; SI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 + ; SI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV4]], [[C]] ; SI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 ; SI-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV4]], [[C1]] - ; SI-NEXT: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[LSHR]], [[FSHR]], [[XOR]](s32) - ; SI-NEXT: [[FSHR2:%[0-9]+]]:_(s32) = G_FSHR [[UV1]], [[UV3]], [[C]](s32) - ; SI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) + ; SI-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[XOR]], [[C]] + ; SI-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; SI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[UV]], [[AND]](s32) + ; SI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C2]](s32) + ; SI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[LSHR]], [[AND1]](s32) + ; SI-NEXT: [[OR:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL]], [[LSHR1]] + ; SI-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV5]], [[C]] ; SI-NEXT: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[UV5]], [[C1]] - ; SI-NEXT: [[FSHR3:%[0-9]+]]:_(s32) = G_FSHR [[LSHR1]], [[FSHR2]], [[XOR1]](s32) - ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSHR1]](s32), [[FSHR3]](s32) + ; SI-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[XOR1]], [[C]] + ; SI-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[UV1]], [[AND2]](s32) + ; SI-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C2]](s32) + ; SI-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[LSHR2]], [[AND3]](s32) + ; SI-NEXT: [[OR1:%[0-9]+]]:_(s32) = disjoint G_OR 
[[SHL1]], [[LSHR3]] + ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) ; SI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; VI-LABEL: name: test_fshl_v2s32_v2s32 @@ -94,17 +113,24 @@ body: | ; VI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) ; VI-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) ; VI-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<2 x s32>) - ; VI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; VI-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[UV]], [[UV2]], [[C]](s32) - ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32) + ; VI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 + ; VI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV4]], [[C]] ; VI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 ; VI-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV4]], [[C1]] - ; VI-NEXT: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[LSHR]], [[FSHR]], [[XOR]](s32) - ; VI-NEXT: [[FSHR2:%[0-9]+]]:_(s32) = G_FSHR [[UV1]], [[UV3]], [[C]](s32) - ; VI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) + ; VI-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[XOR]], [[C]] + ; VI-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; VI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[UV]], [[AND]](s32) + ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C2]](s32) + ; VI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[LSHR]], [[AND1]](s32) + ; VI-NEXT: [[OR:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL]], [[LSHR1]] + ; VI-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV5]], [[C]] ; VI-NEXT: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[UV5]], [[C1]] - ; VI-NEXT: [[FSHR3:%[0-9]+]]:_(s32) = G_FSHR [[LSHR1]], [[FSHR2]], [[XOR1]](s32) - ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSHR1]](s32), [[FSHR3]](s32) + ; VI-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[XOR1]], [[C]] + ; VI-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[UV1]], [[AND2]](s32) + ; 
VI-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C2]](s32) + ; VI-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[LSHR2]], [[AND3]](s32) + ; VI-NEXT: [[OR1:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL1]], [[LSHR3]] + ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; GFX9-LABEL: name: test_fshl_v2s32_v2s32 @@ -116,17 +142,24 @@ body: | ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<2 x s32>) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[UV]], [[UV2]], [[C]](s32) - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 + ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV4]], [[C]] ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 ; GFX9-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV4]], [[C1]] - ; GFX9-NEXT: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[LSHR]], [[FSHR]], [[XOR]](s32) - ; GFX9-NEXT: [[FSHR2:%[0-9]+]]:_(s32) = G_FSHR [[UV1]], [[UV3]], [[C]](s32) - ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) + ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[XOR]], [[C]] + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[UV]], [[AND]](s32) + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C2]](s32) + ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[LSHR]], [[AND1]](s32) + ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL]], [[LSHR1]] + ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV5]], [[C]] ; GFX9-NEXT: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[UV5]], [[C1]] - ; GFX9-NEXT: [[FSHR3:%[0-9]+]]:_(s32) = G_FSHR [[LSHR1]], [[FSHR2]], [[XOR1]](s32) 
- ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSHR1]](s32), [[FSHR3]](s32) + ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[XOR1]], [[C]] + ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[UV1]], [[AND2]](s32) + ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C2]](s32) + ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[LSHR2]], [[AND3]](s32) + ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL1]], [[LSHR3]] + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir index 0a15cc3824ae7..56969c316e697 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir @@ -17,8 +17,17 @@ body: | ; SI-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; SI-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; SI-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; SI-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[COPY]], [[COPY1]], [[COPY2]](s32) - ; SI-NEXT: $vgpr0 = COPY [[FSHR]](s32) + ; SI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 + ; SI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]] + ; SI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; SI-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY2]], [[C1]] + ; SI-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[XOR]], [[C]] + ; SI-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; SI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C2]](s32) + ; SI-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[SHL]], [[AND1]](s32) + ; SI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[AND]](s32) + ; SI-NEXT: [[OR:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL1]], [[LSHR]] + ; SI-NEXT: $vgpr0 = COPY [[OR]](s32) ; ; VI-LABEL: name: test_fshr_s32_s32 ; VI: liveins: $vgpr0, $vgpr1, $vgpr2 
@@ -26,8 +35,17 @@ body: | ; VI-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; VI-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; VI-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; VI-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[COPY]], [[COPY1]], [[COPY2]](s32) - ; VI-NEXT: $vgpr0 = COPY [[FSHR]](s32) + ; VI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 + ; VI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]] + ; VI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; VI-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY2]], [[C1]] + ; VI-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[XOR]], [[C]] + ; VI-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; VI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C2]](s32) + ; VI-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[SHL]], [[AND1]](s32) + ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[AND]](s32) + ; VI-NEXT: [[OR:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL1]], [[LSHR]] + ; VI-NEXT: $vgpr0 = COPY [[OR]](s32) ; ; GFX9-LABEL: name: test_fshr_s32_s32 ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -35,8 +53,17 @@ body: | ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[COPY]], [[COPY1]], [[COPY2]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[FSHR]](s32) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 + ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]] + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; GFX9-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY2]], [[C1]] + ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[XOR]], [[C]] + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[SHL]], [[AND1]](s32) + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[AND]](s32) + ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL1]], [[LSHR]] + ; 
GFX9-NEXT: $vgpr0 = COPY [[OR]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = COPY $vgpr2 @@ -59,9 +86,24 @@ body: | ; SI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) ; SI-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) ; SI-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<2 x s32>) - ; SI-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[UV]], [[UV2]], [[UV4]](s32) - ; SI-NEXT: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[UV1]], [[UV3]], [[UV5]](s32) - ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSHR]](s32), [[FSHR1]](s32) + ; SI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 + ; SI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV4]], [[C]] + ; SI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; SI-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV4]], [[C1]] + ; SI-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[XOR]], [[C]] + ; SI-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; SI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[UV]], [[C2]](s32) + ; SI-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[SHL]], [[AND1]](s32) + ; SI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[AND]](s32) + ; SI-NEXT: [[OR:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL1]], [[LSHR]] + ; SI-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV5]], [[C]] + ; SI-NEXT: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[UV5]], [[C1]] + ; SI-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[XOR1]], [[C]] + ; SI-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[UV1]], [[C2]](s32) + ; SI-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[SHL2]], [[AND3]](s32) + ; SI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[AND2]](s32) + ; SI-NEXT: [[OR1:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL3]], [[LSHR1]] + ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) ; SI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; VI-LABEL: name: test_fshr_v2s32_v2s32 @@ -73,9 +115,24 @@ body: | ; VI-NEXT: 
[[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) ; VI-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) ; VI-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<2 x s32>) - ; VI-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[UV]], [[UV2]], [[UV4]](s32) - ; VI-NEXT: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[UV1]], [[UV3]], [[UV5]](s32) - ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSHR]](s32), [[FSHR1]](s32) + ; VI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 + ; VI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV4]], [[C]] + ; VI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; VI-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV4]], [[C1]] + ; VI-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[XOR]], [[C]] + ; VI-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; VI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[UV]], [[C2]](s32) + ; VI-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[SHL]], [[AND1]](s32) + ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[AND]](s32) + ; VI-NEXT: [[OR:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL1]], [[LSHR]] + ; VI-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV5]], [[C]] + ; VI-NEXT: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[UV5]], [[C1]] + ; VI-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[XOR1]], [[C]] + ; VI-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[UV1]], [[C2]](s32) + ; VI-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[SHL2]], [[AND3]](s32) + ; VI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[AND2]](s32) + ; VI-NEXT: [[OR1:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL3]], [[LSHR1]] + ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; GFX9-LABEL: name: test_fshr_v2s32_v2s32 @@ -87,9 +144,24 @@ body: | ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = 
G_UNMERGE_VALUES [[COPY1]](<2 x s32>) ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<2 x s32>) - ; GFX9-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[UV]], [[UV2]], [[UV4]](s32) - ; GFX9-NEXT: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[UV1]], [[UV3]], [[UV5]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSHR]](s32), [[FSHR1]](s32) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 + ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV4]], [[C]] + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; GFX9-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV4]], [[C1]] + ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[XOR]], [[C]] + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[UV]], [[C2]](s32) + ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[SHL]], [[AND1]](s32) + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[AND]](s32) + ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL1]], [[LSHR]] + ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV5]], [[C]] + ; GFX9-NEXT: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[UV5]], [[C1]] + ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[XOR1]], [[C]] + ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[UV1]], [[C2]](s32) + ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[SHL2]], [[AND3]](s32) + ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[AND2]](s32) + ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = disjoint G_OR [[SHL3]], [[LSHR1]] + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-rotl-rotr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-rotl-rotr.mir index 7fdee12315754..f4b6727c82e99 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-rotl-rotr.mir +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-rotl-rotr.mir @@ -180,9 +180,14 @@ body: | ; GFX-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 ; GFX-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1 ; GFX-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 ; GFX-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[COPY1]] - ; GFX-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[COPY]], [[COPY]], [[SUB]](s32) - ; GFX-NEXT: $sgpr0 = COPY [[FSHR]](s32) + ; GFX-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[AND]](s32) + ; GFX-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C1]] + ; GFX-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[AND1]](s32) + ; GFX-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[LSHR]] + ; GFX-NEXT: $sgpr0 = COPY [[OR]](s32) %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $sgpr1 %2:_(s32) = G_ROTL %0, %1(s32) @@ -300,15 +305,32 @@ body: | ; GFX-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) ; GFX-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<4 x s32>) ; GFX-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 ; GFX-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV4]] - ; GFX-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[UV]], [[UV]], [[SUB]](s32) + ; GFX-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV4]], [[C1]] + ; GFX-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[UV]], [[AND]](s32) + ; GFX-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C1]] + ; GFX-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[AND1]](s32) + ; GFX-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[LSHR]] ; GFX-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV5]] - ; GFX-NEXT: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[UV1]], [[UV1]], [[SUB1]](s32) + ; GFX-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND 
[[UV5]], [[C1]] + ; GFX-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[UV1]], [[AND2]](s32) + ; GFX-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[SUB1]], [[C1]] + ; GFX-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[AND3]](s32) + ; GFX-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[LSHR1]] ; GFX-NEXT: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV6]] - ; GFX-NEXT: [[FSHR2:%[0-9]+]]:_(s32) = G_FSHR [[UV2]], [[UV2]], [[SUB2]](s32) + ; GFX-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[UV6]], [[C1]] + ; GFX-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[UV2]], [[AND4]](s32) + ; GFX-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[SUB2]], [[C1]] + ; GFX-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[AND5]](s32) + ; GFX-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[LSHR2]] ; GFX-NEXT: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV7]] - ; GFX-NEXT: [[FSHR3:%[0-9]+]]:_(s32) = G_FSHR [[UV3]], [[UV3]], [[SUB3]](s32) - ; GFX-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[FSHR]](s32), [[FSHR1]](s32), [[FSHR2]](s32), [[FSHR3]](s32) + ; GFX-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[UV7]], [[C1]] + ; GFX-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[UV3]], [[AND6]](s32) + ; GFX-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[SUB3]], [[C1]] + ; GFX-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[AND7]](s32) + ; GFX-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[LSHR3]] + ; GFX-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) ; GFX-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(<4 x s32>) = COPY $sgpr4_sgpr5_sgpr6_sgpr7 @@ -391,8 +413,15 @@ body: | ; GFX-NEXT: {{ $}} ; GFX-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 ; GFX-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1 - ; GFX-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[COPY]], [[COPY]], [[COPY1]](s32) - ; GFX-NEXT: $sgpr0 = COPY [[FSHR]](s32) + ; GFX-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX-NEXT: 
[[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 + ; GFX-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[COPY1]] + ; GFX-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[AND]](s32) + ; GFX-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C1]] + ; GFX-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[AND1]](s32) + ; GFX-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[LSHR]], [[SHL]] + ; GFX-NEXT: $sgpr0 = COPY [[OR]](s32) %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $sgpr1 %2:_(s32) = G_ROTR %0, %1(s32) @@ -452,11 +481,33 @@ body: | ; GFX-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr4_sgpr5_sgpr6_sgpr7 ; GFX-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) ; GFX-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<4 x s32>) - ; GFX-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[UV]], [[UV]], [[UV4]](s32) - ; GFX-NEXT: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[UV1]], [[UV1]], [[UV5]](s32) - ; GFX-NEXT: [[FSHR2:%[0-9]+]]:_(s32) = G_FSHR [[UV2]], [[UV2]], [[UV6]](s32) - ; GFX-NEXT: [[FSHR3:%[0-9]+]]:_(s32) = G_FSHR [[UV3]], [[UV3]], [[UV7]](s32) - ; GFX-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[FSHR]](s32), [[FSHR1]](s32), [[FSHR2]](s32), [[FSHR3]](s32) + ; GFX-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 + ; GFX-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV4]] + ; GFX-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV4]], [[C1]] + ; GFX-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[AND]](s32) + ; GFX-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C1]] + ; GFX-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[UV]], [[AND1]](s32) + ; GFX-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[LSHR]], [[SHL]] + ; GFX-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV5]] + ; GFX-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV5]], 
[[C1]] + ; GFX-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[AND2]](s32) + ; GFX-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[SUB1]], [[C1]] + ; GFX-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[UV1]], [[AND3]](s32) + ; GFX-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[LSHR1]], [[SHL1]] + ; GFX-NEXT: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV6]] + ; GFX-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[UV6]], [[C1]] + ; GFX-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[AND4]](s32) + ; GFX-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[SUB2]], [[C1]] + ; GFX-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[UV2]], [[AND5]](s32) + ; GFX-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[LSHR2]], [[SHL2]] + ; GFX-NEXT: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV7]] + ; GFX-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[UV7]], [[C1]] + ; GFX-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[AND6]](s32) + ; GFX-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[SUB3]], [[C1]] + ; GFX-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[UV3]], [[AND7]](s32) + ; GFX-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[LSHR3]], [[SHL3]] + ; GFX-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) ; GFX-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(<4 x s32>) = COPY $sgpr4_sgpr5_sgpr6_sgpr7 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fshr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fshr.mir deleted file mode 100644 index b1a55fe7bc42f..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fshr.mir +++ /dev/null @@ -1,168 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s | FileCheck %s - ---- -name: fshr_sss 
-legalized: true - -body: | - bb.0: - liveins: $sgpr0, $sgpr1, $sgpr2 - ; CHECK-LABEL: name: fshr_sss - ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[FSHR:%[0-9]+]]:vgpr(s32) = G_FSHR [[COPY3]], [[COPY4]], [[COPY5]](s32) - %0:_(s32) = COPY $sgpr0 - %1:_(s32) = COPY $sgpr1 - %2:_(s32) = COPY $sgpr2 - %3:_(s32) = G_FSHR %0, %1, %2 -... ---- -name: fshr_vss -legalized: true - -body: | - bb.0: - liveins: $vgpr0, $sgpr0, $sgpr1 - ; CHECK-LABEL: name: fshr_vss - ; CHECK: liveins: $vgpr0, $sgpr0, $sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[FSHR:%[0-9]+]]:vgpr(s32) = G_FSHR [[COPY]], [[COPY3]], [[COPY4]](s32) - %0:_(s32) = COPY $vgpr0 - %1:_(s32) = COPY $sgpr0 - %2:_(s32) = COPY $sgpr1 - %3:_(s32) = G_FSHR %0, %1, %2 -... 
---- -name: fshr_svs -legalized: true - -body: | - bb.0: - liveins: $sgpr0, $vgpr0, $sgpr1 - ; CHECK-LABEL: name: fshr_svs - ; CHECK: liveins: $sgpr0, $vgpr0, $sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[FSHR:%[0-9]+]]:vgpr(s32) = G_FSHR [[COPY3]], [[COPY1]], [[COPY4]](s32) - %0:_(s32) = COPY $sgpr0 - %1:_(s32) = COPY $vgpr0 - %2:_(s32) = COPY $sgpr1 - %3:_(s32) = G_FSHR %0, %1, %2 -... ---- -name: fshr_ssv -legalized: true - -body: | - bb.0: - liveins: $sgpr0, $sgpr1, $vgpr0 - ; CHECK-LABEL: name: fshr_ssv - ; CHECK: liveins: $sgpr0, $sgpr1, $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; CHECK-NEXT: [[FSHR:%[0-9]+]]:vgpr(s32) = G_FSHR [[COPY3]], [[COPY4]], [[COPY2]](s32) - %0:_(s32) = COPY $sgpr0 - %1:_(s32) = COPY $sgpr1 - %2:_(s32) = COPY $vgpr0 - %3:_(s32) = G_FSHR %0, %1, %2 -... 
---- -name: fshr_vvs -legalized: true - -body: | - bb.0: - liveins: $vgpr0, $vgpr1, $sgpr0 - ; CHECK-LABEL: name: fshr_vvs - ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) - ; CHECK-NEXT: [[FSHR:%[0-9]+]]:vgpr(s32) = G_FSHR [[COPY]], [[COPY1]], [[COPY3]](s32) - %0:_(s32) = COPY $vgpr0 - %1:_(s32) = COPY $vgpr1 - %2:_(s32) = COPY $sgpr0 - %3:_(s32) = G_FSHR %0, %1, %2 -... ---- -name: fshr_vsv -legalized: true - -body: | - bb.0: - liveins: $vgpr0, $sgpr0, $vgpr1 - ; CHECK-LABEL: name: fshr_vsv - ; CHECK: liveins: $vgpr0, $sgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; CHECK-NEXT: [[FSHR:%[0-9]+]]:vgpr(s32) = G_FSHR [[COPY]], [[COPY3]], [[COPY2]](s32) - %0:_(s32) = COPY $vgpr0 - %1:_(s32) = COPY $sgpr1 - %2:_(s32) = COPY $vgpr1 - %3:_(s32) = G_FSHR %0, %1, %2 -... ---- -name: fshr_svv -legalized: true - -body: | - bb.0: - liveins: $sgpr0, $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fshr_svv - ; CHECK: liveins: $sgpr0, $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; CHECK-NEXT: [[FSHR:%[0-9]+]]:vgpr(s32) = G_FSHR [[COPY3]], [[COPY1]], [[COPY2]](s32) - %0:_(s32) = COPY $sgpr0 - %1:_(s32) = COPY $vgpr0 - %2:_(s32) = COPY $vgpr1 - %3:_(s32) = G_FSHR %0, %1, %2 -... 
---- -name: fshr_vvv -legalized: true - -body: | - bb.0: - liveins: $vgpr0, $vgpr1, $vgpr2 - ; CHECK-LABEL: name: fshr_vvv - ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 - ; CHECK-NEXT: [[FSHR:%[0-9]+]]:vgpr(s32) = G_FSHR [[COPY]], [[COPY1]], [[COPY2]](s32) - %0:_(s32) = COPY $vgpr0 - %1:_(s32) = COPY $vgpr1 - %2:_(s32) = COPY $vgpr2 - %3:_(s32) = G_FSHR %0, %1, %2 -... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll index d9158e3558395..acd3dc683fa86 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -376,9 +376,9 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX6-NEXT: s_min_u32 s2, s3, s2 ; GFX6-NEXT: s_add_i32 s1, s1, s2 ; GFX6-NEXT: s_lshr_b32 s1, s1, 24 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 24 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_lshr_b32 s0, s0, 24 +; GFX6-NEXT: s_lshl_b32 s1, s1, 8 +; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_uaddsat_v2i8: @@ -726,18 +726,18 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX6-NEXT: s_lshl_b32 s3, s4, 24 ; GFX6-NEXT: s_lshl_b32 s4, s7, 24 ; GFX6-NEXT: s_not_b32 s5, s3 -; GFX6-NEXT: s_min_u32 s4, s5, s4 ; GFX6-NEXT: s_lshr_b32 s1, s1, 24 +; GFX6-NEXT: s_min_u32 s4, s5, s4 +; GFX6-NEXT: s_lshr_b32 s0, s0, 24 ; GFX6-NEXT: s_lshr_b32 s2, s2, 24 ; GFX6-NEXT: s_add_i32 s3, s3, s4 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_lshr_b32 s3, s3, 24 -; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 24 -; GFX6-NEXT: s_lshl_b32 s0, s2, 16 -; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX6-NEXT: s_lshl_b32 s0, s3, 24 -; 
GFX6-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s2, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s3, 24 +; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_uaddsat_v4i8: @@ -2142,9 +2142,9 @@ define amdgpu_ps i32 @s_uaddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; GFX6-NEXT: s_min_u32 s2, s3, s2 ; GFX6-NEXT: s_add_i32 s1, s1, s2 ; GFX6-NEXT: s_lshr_b32 s1, s1, 16 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_lshr_b32 s0, s0, 16 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_uaddsat_v2i16: @@ -2349,15 +2349,15 @@ define amdgpu_ps <2 x i32> @s_uaddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX6-NEXT: s_lshl_b32 s4, s7, 16 ; GFX6-NEXT: s_not_b32 s5, s3 ; GFX6-NEXT: s_min_u32 s4, s5, s4 -; GFX6-NEXT: s_add_i32 s3, s3, s4 ; GFX6-NEXT: s_lshr_b32 s1, s1, 16 +; GFX6-NEXT: s_add_i32 s3, s3, s4 +; GFX6-NEXT: s_lshr_b32 s0, s0, 16 ; GFX6-NEXT: s_lshr_b32 s3, s3, 16 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16 -; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_lshr_b32 s2, s2, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s3, 16 +; GFX6-NEXT: s_or_b32 s1, s2, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_uaddsat_v4i16: @@ -2522,20 +2522,20 @@ define amdgpu_ps <3 x i32> @s_uaddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX6-NEXT: s_add_i32 s4, s4, s6 ; GFX6-NEXT: s_lshl_b32 s6, s11, 16 ; GFX6-NEXT: s_not_b32 s7, s5 -; GFX6-NEXT: s_min_u32 s6, s7, s6 -; GFX6-NEXT: s_add_i32 s5, s5, s6 ; GFX6-NEXT: 
s_lshr_b32 s1, s1, 16 +; GFX6-NEXT: s_min_u32 s6, s7, s6 +; GFX6-NEXT: s_lshr_b32 s0, s0, 16 ; GFX6-NEXT: s_lshr_b32 s3, s3, 16 +; GFX6-NEXT: s_add_i32 s5, s5, s6 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_lshr_b32 s2, s2, 16 ; GFX6-NEXT: s_lshr_b32 s5, s5, 16 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16 -; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16 -; GFX6-NEXT: v_alignbit_b32 v2, s5, v2, 16 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v2 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s3, 16 +; GFX6-NEXT: s_lshr_b32 s4, s4, 16 +; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_lshl_b32 s2, s5, 16 +; GFX6-NEXT: s_or_b32 s2, s4, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_uaddsat_v6i16: @@ -2730,24 +2730,24 @@ define amdgpu_ps <4 x i32> @s_uaddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX6-NEXT: s_add_i32 s6, s6, s8 ; GFX6-NEXT: s_lshl_b32 s8, s15, 16 ; GFX6-NEXT: s_not_b32 s9, s7 -; GFX6-NEXT: s_min_u32 s8, s9, s8 -; GFX6-NEXT: s_add_i32 s7, s7, s8 ; GFX6-NEXT: s_lshr_b32 s1, s1, 16 +; GFX6-NEXT: s_min_u32 s8, s9, s8 +; GFX6-NEXT: s_lshr_b32 s0, s0, 16 ; GFX6-NEXT: s_lshr_b32 s3, s3, 16 +; GFX6-NEXT: s_add_i32 s7, s7, s8 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_lshr_b32 s2, s2, 16 ; GFX6-NEXT: s_lshr_b32 s5, s5, 16 ; GFX6-NEXT: s_lshr_b32 s7, s7, 16 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 -; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16 -; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16 -; GFX6-NEXT: v_alignbit_b32 v2, s5, v2, 16 -; GFX6-NEXT: v_alignbit_b32 v3, s7, v3, 16 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v2 -; GFX6-NEXT: v_readfirstlane_b32 s3, v3 +; 
GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s3, 16 +; GFX6-NEXT: s_lshr_b32 s4, s4, 16 +; GFX6-NEXT: s_lshr_b32 s6, s6, 16 +; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_lshl_b32 s2, s5, 16 +; GFX6-NEXT: s_lshl_b32 s3, s7, 16 +; GFX6-NEXT: s_or_b32 s2, s4, s2 +; GFX6-NEXT: s_or_b32 s3, s6, s3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_uaddsat_v8i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll index 1fd139b06417f..a7f3bc016948a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -368,9 +368,9 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX6-NEXT: s_min_u32 s2, s1, s2 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 ; GFX6-NEXT: s_lshr_b32 s1, s1, 24 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 24 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_lshr_b32 s0, s0, 24 +; GFX6-NEXT: s_lshl_b32 s1, s1, 8 +; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_usubsat_v2i8: @@ -710,18 +710,18 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX6-NEXT: s_sub_i32 s2, s2, s3 ; GFX6-NEXT: s_lshl_b32 s3, s4, 24 ; GFX6-NEXT: s_lshl_b32 s4, s7, 24 -; GFX6-NEXT: s_min_u32 s4, s3, s4 ; GFX6-NEXT: s_lshr_b32 s1, s1, 24 +; GFX6-NEXT: s_min_u32 s4, s3, s4 +; GFX6-NEXT: s_lshr_b32 s0, s0, 24 ; GFX6-NEXT: s_lshr_b32 s2, s2, 24 ; GFX6-NEXT: s_sub_i32 s3, s3, s4 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_lshr_b32 s3, s3, 24 -; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 24 -; GFX6-NEXT: s_lshl_b32 s0, s2, 16 -; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX6-NEXT: s_lshl_b32 s0, s3, 24 -; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s2, 16 +; GFX6-NEXT: s_or_b32 s0, 
s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s3, 24 +; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_usubsat_v4i8: @@ -2052,9 +2052,9 @@ define amdgpu_ps i32 @s_usubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; GFX6-NEXT: s_min_u32 s2, s1, s2 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 ; GFX6-NEXT: s_lshr_b32 s1, s1, 16 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_lshr_b32 s0, s0, 16 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_usubsat_v2i16: @@ -2247,15 +2247,15 @@ define amdgpu_ps <2 x i32> @s_usubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_lshl_b32 s4, s7, 16 ; GFX6-NEXT: s_min_u32 s4, s3, s4 -; GFX6-NEXT: s_sub_i32 s3, s3, s4 ; GFX6-NEXT: s_lshr_b32 s1, s1, 16 +; GFX6-NEXT: s_sub_i32 s3, s3, s4 +; GFX6-NEXT: s_lshr_b32 s0, s0, 16 ; GFX6-NEXT: s_lshr_b32 s3, s3, 16 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16 -; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_lshr_b32 s2, s2, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s3, 16 +; GFX6-NEXT: s_or_b32 s1, s2, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_usubsat_v4i16: @@ -2408,20 +2408,20 @@ define amdgpu_ps <3 x i32> @s_usubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX6-NEXT: s_sub_i32 s4, s4, s6 ; GFX6-NEXT: s_lshl_b32 s5, s5, 16 ; GFX6-NEXT: s_lshl_b32 s6, s11, 16 -; GFX6-NEXT: s_min_u32 s6, s5, s6 -; GFX6-NEXT: s_sub_i32 s5, s5, s6 ; GFX6-NEXT: s_lshr_b32 s1, s1, 16 +; GFX6-NEXT: s_min_u32 s6, s5, s6 +; GFX6-NEXT: s_lshr_b32 s0, s0, 16 ; GFX6-NEXT: s_lshr_b32 s3, s3, 16 +; GFX6-NEXT: s_sub_i32 s5, s5, s6 +; 
GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_lshr_b32 s2, s2, 16 ; GFX6-NEXT: s_lshr_b32 s5, s5, 16 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16 -; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16 -; GFX6-NEXT: v_alignbit_b32 v2, s5, v2, 16 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v2 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s3, 16 +; GFX6-NEXT: s_lshr_b32 s4, s4, 16 +; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_lshl_b32 s2, s5, 16 +; GFX6-NEXT: s_or_b32 s2, s4, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_usubsat_v6i16: @@ -2600,24 +2600,24 @@ define amdgpu_ps <4 x i32> @s_usubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX6-NEXT: s_sub_i32 s6, s6, s8 ; GFX6-NEXT: s_lshl_b32 s7, s7, 16 ; GFX6-NEXT: s_lshl_b32 s8, s15, 16 -; GFX6-NEXT: s_min_u32 s8, s7, s8 -; GFX6-NEXT: s_sub_i32 s7, s7, s8 ; GFX6-NEXT: s_lshr_b32 s1, s1, 16 +; GFX6-NEXT: s_min_u32 s8, s7, s8 +; GFX6-NEXT: s_lshr_b32 s0, s0, 16 ; GFX6-NEXT: s_lshr_b32 s3, s3, 16 +; GFX6-NEXT: s_sub_i32 s7, s7, s8 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_lshr_b32 s2, s2, 16 ; GFX6-NEXT: s_lshr_b32 s5, s5, 16 ; GFX6-NEXT: s_lshr_b32 s7, s7, 16 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_mov_b32_e32 v3, s6 -; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16 -; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16 -; GFX6-NEXT: v_alignbit_b32 v2, s5, v2, 16 -; GFX6-NEXT: v_alignbit_b32 v3, s7, v3, 16 -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v2 -; GFX6-NEXT: v_readfirstlane_b32 s3, v3 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s3, 16 +; GFX6-NEXT: s_lshr_b32 s4, s4, 16 +; GFX6-NEXT: s_lshr_b32 s6, s6, 16 +; GFX6-NEXT: s_or_b32 s1, 
s2, s1 +; GFX6-NEXT: s_lshl_b32 s2, s5, 16 +; GFX6-NEXT: s_lshl_b32 s3, s7, 16 +; GFX6-NEXT: s_or_b32 s2, s4, s2 +; GFX6-NEXT: s_or_b32 s3, s6, s3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_usubsat_v8i16: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index 0d5f538215f18..17924629fef84 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -22563,36 +22563,36 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; 
implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -22668,118 +22668,118 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded 
Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v62 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v62 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v63 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v63 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword 
v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v11 -; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v7 -; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 -; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4 -; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; SI-NEXT: 
v_and_b32_e32 v59, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 -; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v11 +; SI-NEXT: 
v_and_b32_e32 v50, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v9 +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v8 +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v6 +; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v5 +; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v4 +; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v3 +; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v2 +; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -22822,7 +22822,7 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 @@ -22831,62 +22831,62 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 ; SI-NEXT: v_lshlrev_b32_e32 v29, 
16, v29 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 ; SI-NEXT: v_lshlrev_b32_e32 v23, 
16, v23 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 @@ -22906,165 +22906,180 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v11 -; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v7 -; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 -; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4 -; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 -; SI-NEXT: 
v_lshlrev_b32_e32 v56, 16, v3 -; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 -; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v10 +; 
SI-NEXT: v_lshlrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v9 +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v8 +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v6 +; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v5 +; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v4 +; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v3 +; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v2 +; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], 
s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: .LBB16_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v61 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v60 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v46 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v44 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: 
v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v42 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v50 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v36 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v32 -; SI-NEXT: 
v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -23072,10 +23087,11 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -23083,10 +23099,11 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -23094,10 +23111,11 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -23105,10 +23123,11 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -23116,10 +23135,11 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -23127,10 +23147,11 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: 
v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -23138,10 +23159,11 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -23149,10 +23171,11 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -23160,10 +23183,11 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: 
v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -23171,10 +23195,11 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -23182,10 +23207,11 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -23193,10 +23219,11 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 
v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -23204,10 +23231,11 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -23215,10 +23243,11 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -23226,19 +23255,21 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, 
v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -23247,10 +23278,11 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload @@ -23509,138 +23541,138 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v21, s4, 1 -; SI-NEXT: s_lshl_b32 s4, s17, 16 ; SI-NEXT: v_writelane_b32 v21, s4, 0 +; SI-NEXT: s_lshl_b32 s4, s17, 16 +; SI-NEXT: v_writelane_b32 v21, s4, 1 ; SI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; SI-NEXT: v_writelane_b32 v21, s4, 2 ; SI-NEXT: s_lshl_b32 s4, s16, 16 -; SI-NEXT: s_and_b32 s11, s9, 0xffff0000 -; SI-NEXT: 
s_lshl_b32 s10, s9, 16 -; SI-NEXT: s_and_b32 s13, s8, 0xffff0000 -; SI-NEXT: s_lshl_b32 s12, s8, 16 -; SI-NEXT: s_and_b32 s15, s7, 0xffff0000 -; SI-NEXT: s_lshl_b32 s14, s7, 16 -; SI-NEXT: s_and_b32 s41, s6, 0xffff0000 -; SI-NEXT: s_lshl_b32 s40, s6, 16 -; SI-NEXT: s_and_b32 s43, s99, 0xffff0000 -; SI-NEXT: s_lshl_b32 s42, s99, 16 -; SI-NEXT: s_and_b32 s45, s98, 0xffff0000 -; SI-NEXT: s_lshl_b32 s44, s98, 16 -; SI-NEXT: s_and_b32 s47, s97, 0xffff0000 -; SI-NEXT: s_lshl_b32 s46, s97, 16 -; SI-NEXT: s_and_b32 s57, s96, 0xffff0000 -; SI-NEXT: s_lshl_b32 s56, s96, 16 -; SI-NEXT: s_and_b32 s59, s87, 0xffff0000 -; SI-NEXT: s_lshl_b32 s58, s87, 16 -; SI-NEXT: s_and_b32 s61, s86, 0xffff0000 -; SI-NEXT: s_lshl_b32 s60, s86, 16 -; SI-NEXT: s_and_b32 s63, s85, 0xffff0000 -; SI-NEXT: s_lshl_b32 s62, s85, 16 -; SI-NEXT: s_and_b32 s73, s84, 0xffff0000 -; SI-NEXT: s_lshl_b32 s72, s84, 16 -; SI-NEXT: s_and_b32 s75, s83, 0xffff0000 -; SI-NEXT: s_lshl_b32 s74, s83, 16 -; SI-NEXT: s_and_b32 s77, s82, 0xffff0000 -; SI-NEXT: s_lshl_b32 s76, s82, 16 -; SI-NEXT: s_and_b32 s79, s81, 0xffff0000 -; SI-NEXT: s_lshl_b32 s78, s81, 16 -; SI-NEXT: s_and_b32 s89, s80, 0xffff0000 -; SI-NEXT: s_lshl_b32 s88, s80, 16 -; SI-NEXT: s_and_b32 s91, s71, 0xffff0000 -; SI-NEXT: s_lshl_b32 s90, s71, 16 -; SI-NEXT: s_and_b32 s93, s70, 0xffff0000 -; SI-NEXT: s_lshl_b32 s92, s70, 16 -; SI-NEXT: s_and_b32 s95, s29, 0xffff0000 -; SI-NEXT: s_lshl_b32 s94, s29, 16 -; SI-NEXT: s_and_b32 s31, s28, 0xffff0000 -; SI-NEXT: s_lshl_b32 s30, s28, 16 -; SI-NEXT: s_and_b32 s35, s27, 0xffff0000 -; SI-NEXT: s_lshl_b32 s34, s27, 16 -; SI-NEXT: s_and_b32 s37, s26, 0xffff0000 -; SI-NEXT: s_lshl_b32 s36, s26, 16 -; SI-NEXT: s_and_b32 s39, s25, 0xffff0000 -; SI-NEXT: s_lshl_b32 s38, s25, 16 -; SI-NEXT: s_and_b32 s49, s24, 0xffff0000 -; SI-NEXT: s_lshl_b32 s48, s24, 16 -; SI-NEXT: s_and_b32 s51, s23, 0xffff0000 -; SI-NEXT: s_lshl_b32 s50, s23, 16 -; SI-NEXT: s_and_b32 s53, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s52, s22, 16 -; 
SI-NEXT: s_and_b32 s55, s21, 0xffff0000 -; SI-NEXT: s_lshl_b32 s54, s21, 16 -; SI-NEXT: s_and_b32 s65, s20, 0xffff0000 -; SI-NEXT: s_lshl_b32 s64, s20, 16 -; SI-NEXT: s_and_b32 s67, s19, 0xffff0000 -; SI-NEXT: s_lshl_b32 s66, s19, 16 -; SI-NEXT: s_and_b32 s69, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s68, s18, 16 +; SI-NEXT: s_and_b32 s10, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s9, 16 +; SI-NEXT: s_and_b32 s12, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s8, 16 +; SI-NEXT: s_and_b32 s14, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s7, 16 +; SI-NEXT: s_and_b32 s40, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s41, s6, 16 +; SI-NEXT: s_and_b32 s42, s99, 0xffff0000 +; SI-NEXT: s_lshl_b32 s43, s99, 16 +; SI-NEXT: s_and_b32 s44, s98, 0xffff0000 +; SI-NEXT: s_lshl_b32 s45, s98, 16 +; SI-NEXT: s_and_b32 s46, s97, 0xffff0000 +; SI-NEXT: s_lshl_b32 s47, s97, 16 +; SI-NEXT: s_and_b32 s56, s96, 0xffff0000 +; SI-NEXT: s_lshl_b32 s57, s96, 16 +; SI-NEXT: s_and_b32 s58, s87, 0xffff0000 +; SI-NEXT: s_lshl_b32 s59, s87, 16 +; SI-NEXT: s_and_b32 s60, s86, 0xffff0000 +; SI-NEXT: s_lshl_b32 s61, s86, 16 +; SI-NEXT: s_and_b32 s62, s85, 0xffff0000 +; SI-NEXT: s_lshl_b32 s63, s85, 16 +; SI-NEXT: s_and_b32 s72, s84, 0xffff0000 +; SI-NEXT: s_lshl_b32 s73, s84, 16 +; SI-NEXT: s_and_b32 s74, s83, 0xffff0000 +; SI-NEXT: s_lshl_b32 s75, s83, 16 +; SI-NEXT: s_and_b32 s76, s82, 0xffff0000 +; SI-NEXT: s_lshl_b32 s77, s82, 16 +; SI-NEXT: s_and_b32 s78, s81, 0xffff0000 +; SI-NEXT: s_lshl_b32 s79, s81, 16 +; SI-NEXT: s_and_b32 s88, s80, 0xffff0000 +; SI-NEXT: s_lshl_b32 s89, s80, 16 +; SI-NEXT: s_and_b32 s90, s71, 0xffff0000 +; SI-NEXT: s_lshl_b32 s91, s71, 16 +; SI-NEXT: s_and_b32 s92, s70, 0xffff0000 +; SI-NEXT: s_lshl_b32 s93, s70, 16 +; SI-NEXT: s_and_b32 s94, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s95, s29, 16 +; SI-NEXT: s_and_b32 s30, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s31, s28, 16 +; SI-NEXT: s_and_b32 s34, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s35, s27, 16 +; SI-NEXT: s_and_b32 s36, s26, 
0xffff0000 +; SI-NEXT: s_lshl_b32 s37, s26, 16 +; SI-NEXT: s_and_b32 s38, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s39, s25, 16 +; SI-NEXT: s_and_b32 s48, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s49, s24, 16 +; SI-NEXT: s_and_b32 s50, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s51, s23, 16 +; SI-NEXT: s_and_b32 s52, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s53, s22, 16 +; SI-NEXT: s_and_b32 s54, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s55, s21, 16 +; SI-NEXT: s_and_b32 s64, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s65, s20, 16 +; SI-NEXT: s_and_b32 s66, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s67, s19, 16 +; SI-NEXT: s_and_b32 s68, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s69, s18, 16 ; SI-NEXT: v_writelane_b32 v21, s4, 3 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB17_3 ; SI-NEXT: .LBB17_2: ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr68 ; SI-NEXT: ; implicit-def: $sgpr69 -; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr68 ; SI-NEXT: ; implicit-def: $sgpr67 -; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr66 ; SI-NEXT: ; implicit-def: $sgpr65 -; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr64 ; SI-NEXT: ; implicit-def: $sgpr55 -; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; implicit-def: $sgpr53 -; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr52 ; SI-NEXT: ; implicit-def: $sgpr51 -; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; implicit-def: $sgpr49 -; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: ; implicit-def: $sgpr39 -; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: ; implicit-def: $sgpr37 -; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr36 ; SI-NEXT: ; implicit-def: $sgpr35 -; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr34 ; SI-NEXT: ; implicit-def: 
$sgpr31 -; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr30 ; SI-NEXT: ; implicit-def: $sgpr95 -; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr94 ; SI-NEXT: ; implicit-def: $sgpr93 -; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr92 ; SI-NEXT: ; implicit-def: $sgpr91 -; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr90 ; SI-NEXT: ; implicit-def: $sgpr89 -; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr88 ; SI-NEXT: ; implicit-def: $sgpr79 -; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr77 -; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr76 ; SI-NEXT: ; implicit-def: $sgpr75 -; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $sgpr73 -; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr47 -; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr46 ; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $sgpr43 -; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr41 -; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr13 -; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: 
$sgpr12 ; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 @@ -23677,8 +23709,11 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI-NEXT: s_mov_b32 s72, s74 ; SI-NEXT: s_mov_b32 s73, s75 ; SI-NEXT: s_mov_b32 s74, s76 -; SI-NEXT: v_readlane_b32 s75, v21, 0 -; SI-NEXT: v_readlane_b32 s76, v21, 1 +; SI-NEXT: s_mov_b32 s75, s77 +; SI-NEXT: s_mov_b32 s76, s78 +; SI-NEXT: s_mov_b32 s77, s79 +; SI-NEXT: v_readlane_b32 s78, v21, 0 +; SI-NEXT: v_readlane_b32 s79, v21, 1 ; SI-NEXT: s_cbranch_vccnz .LBB17_5 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 @@ -23713,296 +23748,328 @@ define inreg <64 x bfloat> @bitcast_v32i32_to_v64bf16_scalar(<32 x i32> inreg %a ; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_and_b32 s15, s6, 0xffff0000 -; SI-NEXT: s_lshl_b32 s14, s6, 16 +; SI-NEXT: s_and_b32 s14, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s6, 16 ; SI-NEXT: s_and_b32 s6, s16, 0xffff0000 -; SI-NEXT: s_and_b32 s5, s9, 0xffff0000 -; SI-NEXT: s_lshl_b32 s4, s9, 16 -; SI-NEXT: s_and_b32 s11, s8, 0xffff0000 -; SI-NEXT: s_lshl_b32 s10, s8, 16 -; SI-NEXT: s_and_b32 s13, s7, 0xffff0000 -; SI-NEXT: s_lshl_b32 s12, s7, 16 -; SI-NEXT: s_and_b32 s41, s99, 0xffff0000 -; SI-NEXT: s_lshl_b32 s40, s99, 16 -; SI-NEXT: s_and_b32 s43, s98, 0xffff0000 -; SI-NEXT: s_lshl_b32 s42, s98, 16 -; SI-NEXT: s_and_b32 s45, s97, 0xffff0000 -; SI-NEXT: s_lshl_b32 s44, s97, 16 -; SI-NEXT: s_and_b32 s47, s96, 0xffff0000 -; SI-NEXT: s_lshl_b32 s46, s96, 16 -; SI-NEXT: s_and_b32 s57, s87, 0xffff0000 -; SI-NEXT: s_lshl_b32 s56, s87, 16 -; SI-NEXT: s_and_b32 s59, s86, 0xffff0000 -; SI-NEXT: s_lshl_b32 s58, s86, 16 -; SI-NEXT: s_and_b32 s61, s85, 0xffff0000 -; SI-NEXT: s_lshl_b32 s60, s85, 16 -; SI-NEXT: s_and_b32 s63, s84, 0xffff0000 -; SI-NEXT: s_lshl_b32 s62, s84, 16 -; SI-NEXT: 
s_and_b32 s73, s83, 0xffff0000 -; SI-NEXT: s_lshl_b32 s72, s83, 16 -; SI-NEXT: s_and_b32 s77, s82, 0xffff0000 -; SI-NEXT: s_lshl_b32 s74, s82, 16 -; SI-NEXT: s_and_b32 s79, s81, 0xffff0000 -; SI-NEXT: s_lshl_b32 s78, s81, 16 -; SI-NEXT: s_and_b32 s89, s80, 0xffff0000 -; SI-NEXT: s_lshl_b32 s88, s80, 16 -; SI-NEXT: s_and_b32 s91, s71, 0xffff0000 -; SI-NEXT: s_lshl_b32 s90, s71, 16 -; SI-NEXT: s_and_b32 s93, s70, 0xffff0000 -; SI-NEXT: s_lshl_b32 s92, s70, 16 -; SI-NEXT: s_and_b32 s95, s29, 0xffff0000 -; SI-NEXT: s_lshl_b32 s94, s29, 16 -; SI-NEXT: s_and_b32 s31, s28, 0xffff0000 -; SI-NEXT: s_lshl_b32 s30, s28, 16 -; SI-NEXT: s_and_b32 s35, s27, 0xffff0000 -; SI-NEXT: s_lshl_b32 s34, s27, 16 -; SI-NEXT: s_and_b32 s37, s26, 0xffff0000 -; SI-NEXT: s_lshl_b32 s36, s26, 16 -; SI-NEXT: s_and_b32 s39, s25, 0xffff0000 -; SI-NEXT: s_lshl_b32 s38, s25, 16 -; SI-NEXT: s_and_b32 s49, s24, 0xffff0000 -; SI-NEXT: s_lshl_b32 s48, s24, 16 -; SI-NEXT: s_and_b32 s51, s23, 0xffff0000 -; SI-NEXT: s_lshl_b32 s50, s23, 16 -; SI-NEXT: s_and_b32 s53, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s52, s22, 16 -; SI-NEXT: s_and_b32 s55, s21, 0xffff0000 -; SI-NEXT: s_lshl_b32 s54, s21, 16 -; SI-NEXT: s_and_b32 s65, s20, 0xffff0000 -; SI-NEXT: s_lshl_b32 s64, s20, 16 -; SI-NEXT: s_and_b32 s67, s19, 0xffff0000 -; SI-NEXT: s_lshl_b32 s66, s19, 16 -; SI-NEXT: s_and_b32 s69, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s68, s18, 16 -; SI-NEXT: s_and_b32 s76, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s75, s17, 16 +; SI-NEXT: s_and_b32 s4, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_and_b32 s10, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s8, 16 +; SI-NEXT: s_and_b32 s12, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s7, 16 +; SI-NEXT: s_and_b32 s40, s99, 0xffff0000 +; SI-NEXT: s_lshl_b32 s41, s99, 16 +; SI-NEXT: s_and_b32 s42, s98, 0xffff0000 +; SI-NEXT: s_lshl_b32 s43, s98, 16 +; SI-NEXT: s_and_b32 s44, s97, 0xffff0000 +; SI-NEXT: s_lshl_b32 s45, s97, 16 +; SI-NEXT: s_and_b32 s46, s96, 0xffff0000 
+; SI-NEXT: s_lshl_b32 s47, s96, 16 +; SI-NEXT: s_and_b32 s56, s87, 0xffff0000 +; SI-NEXT: s_lshl_b32 s57, s87, 16 +; SI-NEXT: s_and_b32 s58, s86, 0xffff0000 +; SI-NEXT: s_lshl_b32 s59, s86, 16 +; SI-NEXT: s_and_b32 s60, s85, 0xffff0000 +; SI-NEXT: s_lshl_b32 s61, s85, 16 +; SI-NEXT: s_and_b32 s62, s84, 0xffff0000 +; SI-NEXT: s_lshl_b32 s63, s84, 16 +; SI-NEXT: s_and_b32 s72, s83, 0xffff0000 +; SI-NEXT: s_lshl_b32 s73, s83, 16 +; SI-NEXT: s_and_b32 s74, s82, 0xffff0000 +; SI-NEXT: s_lshl_b32 s75, s82, 16 +; SI-NEXT: s_and_b32 s76, s81, 0xffff0000 +; SI-NEXT: s_lshl_b32 s77, s81, 16 +; SI-NEXT: s_and_b32 s88, s80, 0xffff0000 +; SI-NEXT: s_lshl_b32 s89, s80, 16 +; SI-NEXT: s_and_b32 s90, s71, 0xffff0000 +; SI-NEXT: s_lshl_b32 s91, s71, 16 +; SI-NEXT: s_and_b32 s92, s70, 0xffff0000 +; SI-NEXT: s_lshl_b32 s93, s70, 16 +; SI-NEXT: s_and_b32 s94, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s95, s29, 16 +; SI-NEXT: s_and_b32 s30, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s31, s28, 16 +; SI-NEXT: s_and_b32 s34, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s35, s27, 16 +; SI-NEXT: s_and_b32 s36, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s37, s26, 16 +; SI-NEXT: s_and_b32 s38, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s39, s25, 16 +; SI-NEXT: s_and_b32 s48, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s49, s24, 16 +; SI-NEXT: s_and_b32 s50, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s51, s23, 16 +; SI-NEXT: s_and_b32 s52, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s53, s22, 16 +; SI-NEXT: s_and_b32 s54, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s55, s21, 16 +; SI-NEXT: s_and_b32 s64, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s65, s20, 16 +; SI-NEXT: s_and_b32 s66, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s67, s19, 16 +; SI-NEXT: s_and_b32 s68, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s69, s18, 16 +; SI-NEXT: s_and_b32 s78, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s79, s17, 16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v21, s6, 2 ; SI-NEXT: s_lshl_b32 s6, s16, 16 ; SI-NEXT: v_writelane_b32 v21, s6, 3 ; 
SI-NEXT: .LBB17_5: ; %end -; SI-NEXT: v_readlane_b32 s6, v21, 2 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 ; SI-NEXT: v_readlane_b32 s6, v21, 3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_readlane_b32 s6, v21, 2 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s6 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s78 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s75 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s68 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s66 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s64 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s54 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s52 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s50 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s48 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s38 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; 
SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s36 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s34 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s30 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s94 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s92 -; 
SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s90 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s88 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s78 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; 
SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s72 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s62 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s60 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s59 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s58 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s56 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, 
s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s47 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s46 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s44 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s43 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s42 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s41 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s40 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s15 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s14 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 
0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s12 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s11 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s10 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s5 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: v_readlane_b32 s99, v20, 35 @@ -24310,213 +24377,224 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 -; 
SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 +; SI-NEXT: 
buffer_load_dword v41, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v20 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v22 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24 ; SI-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v27 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v30 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v61, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v59, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v57, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:96 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v7 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: s_waitcnt expcnt(0) -; 
SI-NEXT: v_mul_f32_e32 v0, 1.0, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 -; SI-NEXT: 
buffer_load_dword v6, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v41 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v42 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v55 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v43 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v44 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v45 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v46 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v0 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:120 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v42 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v45 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v46 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v47 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v6 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v0 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v1 +; SI-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v3 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v4 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v5 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 
4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v61 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v57 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v39 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; 
SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr33 @@ -24542,132 +24620,173 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v62 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v56 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v37 ; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: v_alignbit_b32 v0, v0, v63, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v61, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v59, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v57, 16 -; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 -; SI-NEXT: v_alignbit_b32 v25, v25, v38, 16 -; SI-NEXT: v_alignbit_b32 v26, v26, v35, 16 -; SI-NEXT: v_alignbit_b32 v27, v27, v42, 16 -; SI-NEXT: v_alignbit_b32 v28, v28, v40, 16 -; SI-NEXT: v_alignbit_b32 v29, v29, v53, 16 -; SI-NEXT: v_alignbit_b32 v30, v30, v49, 16 -; SI-NEXT: v_alignbit_b32 v31, v31, v50, 16 -; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; 
implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, 
s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v16, v16, v17, 16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; 
SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_alignbit_b32 v18, v18, v19, 16 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_alignbit_b32 v19, v19, v20, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v9, 
16, v9 -; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_alignbit_b32 v20, v20, v21, 16 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_alignbit_b32 v21, v21, v22, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_alignbit_b32 v22, v22, v23, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v52 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: v_alignbit_b32 v23, v23, v55, 16 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; 
SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v4, v4, v47, 16 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_alignbit_b32 v15, v15, v32, 16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte 
Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v54 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v50 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v38 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v40 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v52 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v36 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 @@ -24713,263 +24832,315 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: .LBB18_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB18_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; 
SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v47 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, 
v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v39 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 
-; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 
offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; 
SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 
0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 
0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: 
v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 
0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v37 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte 
Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v55 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v51 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_and_b32_e32 
v25, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v35 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v42 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v40 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v40 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v53 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v49 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v50 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: .LBB18_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload @@ -25003,12 +25174,12 @@ define <32 x i32> 
@bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB18_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 @@ -25020,14 +25191,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v15, v15, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 @@ -25038,14 +25209,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_and_b32_e32 v32, 
0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v14, v14, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v13 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 @@ -25056,14 +25227,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v13, v13, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v12 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 @@ -25074,14 +25245,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v12, v12, v32 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v11 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 @@ -25092,14 +25263,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v11, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v11, v11, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v10 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 @@ -25110,14 +25281,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v10, v10, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v9 ; VI-NEXT: 
v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 @@ -25128,14 +25299,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v9, v9, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v8 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -25146,14 +25317,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v8, v8, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v7 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: 
v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 @@ -25164,14 +25335,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v7, v7, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v6 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 @@ -25182,14 +25353,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v5 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; VI-NEXT: 
v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 @@ -25200,14 +25371,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v5, v5, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v4 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -25218,14 +25389,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v4, v4, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v3 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -25236,14 
+25407,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v2 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v3, v3, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v2 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -25254,14 +25425,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v2, v2, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v1 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -25272,14 +25443,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 ; 
VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v0 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -25290,15 +25461,15 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v32, 16 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 @@ -25309,14 +25480,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc -; 
VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v30 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 @@ -25327,14 +25498,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_alignbit_b32 v30, v30, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v29 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v30, v30, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v29 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 @@ -25345,14 +25516,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; 
VI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v28 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 @@ -25363,14 +25534,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_alignbit_b32 v28, v28, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v28, v28, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v27 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 @@ -25381,14 +25552,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_alignbit_b32 v27, v27, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v26 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; 
VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v26 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 @@ -25399,14 +25570,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_alignbit_b32 v26, v26, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v26, v26, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v25 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 @@ -25417,14 +25588,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_alignbit_b32 v25, v25, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v24 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v25, v25, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v24 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 @@ -25435,14 +25606,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_alignbit_b32 v24, v24, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v24, v24, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v23 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 @@ -25453,14 +25624,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_alignbit_b32 v23, v23, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v23, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v22 ; VI-NEXT: v_add_f32_e32 v32, 
0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 @@ -25471,14 +25642,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_alignbit_b32 v22, v22, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v21 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v22, v22, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v21 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 @@ -25489,14 +25660,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_alignbit_b32 v21, v21, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v20 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v21, v21, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v20 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, 
v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 @@ -25507,14 +25678,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_alignbit_b32 v20, v20, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v20, v20, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v19 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 @@ -25525,14 +25696,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_alignbit_b32 v19, v19, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v18 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v19, v19, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v18 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 
+; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 @@ -25543,14 +25714,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; VI-NEXT: v_alignbit_b32 v18, v18, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v18, v18, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v17 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 @@ -25561,14 +25732,14 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_alignbit_b32 v17, v17, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v17, v17, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v16 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; 
VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 @@ -25579,8 +25750,8 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v16, v16, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB18_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -27212,6 +27383,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-LABEL: bitcast_v64bf16_to_v32i32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v52, v28 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill @@ -27228,533 +27400,621 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 
offset:4 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 
4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, 
v19 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v26 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v33 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v43 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 
v0, 1.0, v6 -; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v58 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 -; SI-NEXT: v_mov_b32_e32 v38, v12 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v59 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v62 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v30 -; SI-NEXT: v_mov_b32_e32 v37, v14 -; SI-NEXT: v_mov_b32_e32 v14, v11 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v9 -; 
SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v28 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v42 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v43 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v44 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v45 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v46 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v47 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v56 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v57 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v58 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v59 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v60 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v61 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 -; SI-NEXT: v_mul_f32_e32 v31, 
1.0, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v36 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v42, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s22 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 
; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v5, v5, v8, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v35, 16 -; SI-NEXT: v_lshrrev_b32_e32 
v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v4, v4, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mov_b32_e32 v59, v2 -; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 -; SI-NEXT: v_mov_b32_e32 v57, v11 -; SI-NEXT: v_mov_b32_e32 v47, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_mov_b32_e32 v45, v12 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v3, v3, v33, 16 -; SI-NEXT: v_mov_b32_e32 v33, v14 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_mov_b32_e32 v62, v38 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v50 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v49 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, 
v37 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v39 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v38 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v48 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v41 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v40 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v54 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v53 +; SI-NEXT: v_mov_b32_e32 v46, v38 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v29 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v51 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v31 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v18 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v21 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v22 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_mov_b32_e32 v44, v37 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v23 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v24 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v27 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v33, v22 +; SI-NEXT: v_mov_b32_e32 v22, v26 +; SI-NEXT: v_mov_b32_e32 v58, v28 +; SI-NEXT: v_mov_b32_e32 v43, v23 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 
+; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v61, v29 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v52, v50 +; SI-NEXT: v_mov_b32_e32 v50, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v32 +; SI-NEXT: v_mov_b32_e32 v47, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v35 +; SI-NEXT: v_mov_b32_e32 v42, v49 +; SI-NEXT: v_mov_b32_e32 v45, v39 +; SI-NEXT: v_mov_b32_e32 v56, v48 +; SI-NEXT: v_mov_b32_e32 v57, v41 +; SI-NEXT: v_mov_b32_e32 v60, v40 +; SI-NEXT: v_mov_b32_e32 v59, v55 +; SI-NEXT: v_mov_b32_e32 v63, v54 +; SI-NEXT: v_mov_b32_e32 v36, v53 +; SI-NEXT: v_mov_b32_e32 v62, v51 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_mov_b32_e32 v61, v53 -; SI-NEXT: v_alignbit_b32 v15, v15, v53, 16 -; SI-NEXT: v_alignbit_b32 v17, v17, v39, 16 -; SI-NEXT: v_alignbit_b32 v18, v18, v41, 16 -; SI-NEXT: v_alignbit_b32 v19, v19, v40, 16 -; SI-NEXT: v_alignbit_b32 v20, v20, v55, 16 -; SI-NEXT: v_alignbit_b32 v21, v21, v54, 16 -; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v28, 
16, v28 -; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 -; SI-NEXT: v_alignbit_b32 v23, v23, v52, 16 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_alignbit_b32 v25, v25, v50, 16 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_alignbit_b32 v26, v26, v49, 16 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_alignbit_b32 v27, v27, v48, 16 -; SI-NEXT: v_mov_b32_e32 v48, v37 -; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mov_b32_e32 v35, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v43, v8 -; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_alignbit_b32 v31, v31, v34, 16 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, 
v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v58, v11 -; SI-NEXT: v_alignbit_b32 v9, v9, v11, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v56, v11 -; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v46, v12 -; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, v14 -; SI-NEXT: v_alignbit_b32 v12, v12, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v44, v14 -; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 
offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v36, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_alignbit_b32 v14, v14, v38, 16 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v53, v38 -; SI-NEXT: v_alignbit_b32 v16, v16, v38, 16 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v41 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v38 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v22, v54, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v57 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, 
v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, 
v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v15, 
0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 
v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; 
SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43 +; SI-NEXT: 
v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v58 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v61 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v58 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v61 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: 
v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v47 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v53 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v38 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v39 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v41 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v43 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v40 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v55 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v51 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v50 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: 
v_alignbit_b32 v26, v27, v26, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v49 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v48 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v37 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: 
v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: .LBB19_3: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -27772,41 +28032,28 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB19_4: -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v61, v53 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v59, v2 -; SI-NEXT: v_mov_b32_e32 v57, v11 
-; SI-NEXT: v_mov_b32_e32 v47, v10 -; SI-NEXT: v_mov_b32_e32 v45, v12 -; SI-NEXT: v_mov_b32_e32 v33, v14 -; SI-NEXT: v_mov_b32_e32 v62, v38 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v41 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v48, v37 -; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v52, v50 +; SI-NEXT: v_mov_b32_e32 v50, v30 +; SI-NEXT: v_mov_b32_e32 v42, v49 +; SI-NEXT: v_mov_b32_e32 v44, v37 +; SI-NEXT: v_mov_b32_e32 v45, v39 +; SI-NEXT: v_mov_b32_e32 v46, v38 +; SI-NEXT: v_mov_b32_e32 v56, v48 +; SI-NEXT: v_mov_b32_e32 v57, v41 +; SI-NEXT: v_mov_b32_e32 v60, v40 +; SI-NEXT: v_mov_b32_e32 v59, v55 +; SI-NEXT: v_mov_b32_e32 v63, v54 +; SI-NEXT: v_mov_b32_e32 v36, v53 +; SI-NEXT: v_mov_b32_e32 v61, v29 +; SI-NEXT: v_mov_b32_e32 v62, v51 +; SI-NEXT: v_mov_b32_e32 v58, v28 +; SI-NEXT: v_mov_b32_e32 v47, v31 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_mov_b32_e32 v33, v22 +; SI-NEXT: v_mov_b32_e32 v43, v23 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_branch .LBB19_2 ; @@ -27851,12 +28098,12 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB19_3 ; VI-NEXT: .LBB19_2: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v15 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: 
v_lshlrev_b32_e32 v15, 16, v15 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 @@ -27867,14 +28114,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v15, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v14 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 @@ -27885,14 +28132,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v14, v14, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v13 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, 
v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 @@ -27903,14 +28150,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v13, v13, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v12 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 @@ -27921,14 +28168,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v12, v12, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v11 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 
v11, 0x40c00000, v11 @@ -27939,14 +28186,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v11, v11, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v10 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 @@ -27957,14 +28204,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v10, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v9 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 @@ -27975,14 +28222,14 @@ define inreg <32 x i32> 
@bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v9, v9, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v8 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -27993,14 +28240,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v8, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v7 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 @@ -28011,14 +28258,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 ; 
VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v7, v7, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 @@ -28029,14 +28276,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v6, v6, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v5 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 @@ -28047,14 +28294,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc -; VI-NEXT: 
v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v5, v5, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v4 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -28065,14 +28312,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v3 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -28083,14 +28330,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 
v18, 16, v2 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v3, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v2 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -28101,14 +28348,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v1 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -28119,14 +28366,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v1, v1, v18 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v0 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -28137,14 +28384,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v31 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 @@ -28155,14 +28402,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v31, v31, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: 
v_and_b32_e32 v18, 0xffff0000, v30 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 @@ -28173,14 +28420,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v30, v30, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v29 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 @@ -28191,14 +28438,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v29, v29, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v28 ; VI-NEXT: v_add_f32_e32 v18, 
0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 @@ -28209,14 +28456,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v28, v28, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v27 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 @@ -28227,14 +28474,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v27, v27, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v26 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: 
v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 @@ -28245,14 +28492,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v26, v26, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v25 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 @@ -28263,14 +28510,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v25, v25, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v24 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 
0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 @@ -28281,14 +28528,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v24, v24, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v23 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 @@ -28299,14 +28546,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_alignbit_b32 v23, v23, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v23, v23, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v22 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; 
VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 @@ -28317,14 +28564,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v22, v22, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v21 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 @@ -28335,14 +28582,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v21, v21, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v20 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; VI-NEXT: v_or_b32_e32 v34, 
0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 @@ -28353,14 +28600,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v20, v20, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v19 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 @@ -28371,14 +28618,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v19, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: 
v_add_f32_e32 v32, 0x40c00000, v32 @@ -28389,14 +28636,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v32, v32, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 @@ -28407,14 +28654,14 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v16 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 @@ -28425,8 +28672,8 @@ define 
inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v16, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB19_3: ; %end ; VI-NEXT: v_mov_b32_e32 v18, v32 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -59360,36 +59607,36 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr48 ; 
SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -59465,118 +59712,118 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword 
v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v62 -; SI-NEXT: buffer_store_dword 
v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v62 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v63 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v63 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 
4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v11 -; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v7 -; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 -; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4 -; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 -; 
SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 -; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 +; 
SI-NEXT: v_lshlrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v9 +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v8 +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v6 +; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v5 +; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v4 +; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v3 +; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v2 +; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -59619,7 +59866,7 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 @@ -59628,62 +59875,62 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 
0xffff0000, v29 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 ; 
SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 @@ -59703,165 +59950,180 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 ; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v11 -; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v7 -; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 -; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4 -; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 -; 
SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 -; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, 
v10 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v9 +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v8 +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v6 +; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v5 +; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v4 +; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v3 +; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v2 +; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, 
s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: .LBB40_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v61 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v60 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v46 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v44 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: 
v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v42 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v50 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v36 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v32 -; SI-NEXT: 
v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -59869,10 +60131,11 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -59880,10 +60143,11 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -59891,10 +60155,11 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -59902,10 +60167,11 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -59913,10 +60179,11 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -59924,10 +60191,11 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: 
v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -59935,10 +60203,11 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -59946,10 +60215,11 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -59957,10 +60227,11 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; 
SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -59968,10 +60239,11 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -59979,10 +60251,11 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -59990,10 +60263,11 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: 
v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -60001,10 +60275,11 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -60012,10 +60287,11 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -60023,19 +60299,21 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -60044,10 +60322,11 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload @@ -60225,8 +60504,8 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded 
Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_writelane_b32 v63, s30, 0 @@ -60372,109 +60651,107 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; SI-NEXT: s_lshl_b32 s59, s16, 16 ; SI-NEXT: s_cbranch_execnz .LBB41_4 ; SI-NEXT: .LBB41_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v2, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v4, s47, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s47, 1.0 ; SI-NEXT: v_add_f32_e64 v1, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v6, s46, 1.0 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2 -; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 ; SI-NEXT: v_add_f32_e64 v3, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v45, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v43, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v41, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v55, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v53, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v51, s26, 1.0 -; SI-NEXT: v_add_f32_e64 v49, s27, 1.0 -; SI-NEXT: v_add_f32_e64 v39, s28, 1.0 -; SI-NEXT: v_add_f32_e64 v37, s29, 1.0 -; SI-NEXT: v_add_f32_e64 v35, s6, 1.0 -; SI-NEXT: v_add_f32_e64 v33, s7, 1.0 -; SI-NEXT: v_add_f32_e64 v31, s8, 1.0 -; SI-NEXT: v_add_f32_e64 v29, s9, 1.0 -; SI-NEXT: v_add_f32_e64 v27, s10, 1.0 -; SI-NEXT: v_add_f32_e64 v25, s11, 1.0 -; SI-NEXT: v_add_f32_e64 v23, s12, 1.0 -; SI-NEXT: v_add_f32_e64 v21, s13, 1.0 -; SI-NEXT: v_add_f32_e64 v19, s14, 1.0 -; SI-NEXT: v_add_f32_e64 v17, s15, 1.0 -; SI-NEXT: v_add_f32_e64 v15, s40, 1.0 -; SI-NEXT: v_add_f32_e64 v13, s41, 1.0 -; SI-NEXT: v_add_f32_e64 v11, s42, 1.0 -; SI-NEXT: v_add_f32_e64 v9, s43, 1.0 -; SI-NEXT: v_add_f32_e64 v7, s44, 1.0 -; SI-NEXT: v_add_f32_e64 v5, s45, 1.0 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e64 v4, s46, 1.0 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, 
v5 +; SI-NEXT: v_add_f32_e64 v2, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v46, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v44, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v42, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v40, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v54, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v52, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v50, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v48, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v38, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v36, s6, 1.0 +; SI-NEXT: v_add_f32_e64 v34, s7, 1.0 +; SI-NEXT: v_add_f32_e64 v32, s8, 1.0 +; SI-NEXT: v_add_f32_e64 v30, s9, 1.0 +; SI-NEXT: v_add_f32_e64 v28, s10, 1.0 +; SI-NEXT: v_add_f32_e64 v26, s11, 1.0 +; SI-NEXT: v_add_f32_e64 v24, s12, 1.0 +; SI-NEXT: v_add_f32_e64 v22, s13, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s14, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s15, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s40, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s41, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s42, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s43, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s44, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s45, 1.0 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s16, 1.0 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; SI-NEXT: 
v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: v_add_f32_e64 v2, s16, 1.0 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v33 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v35 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v49 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; SI-NEXT: v_and_b32_e32 v52, 
0xffff0000, v51 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v53 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v55 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v41 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v43 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 -; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v45 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v36 +; SI-NEXT: 
v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v38 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v50 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v54 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v40 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v42 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v44 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v46 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_branch .LBB41_5 ; SI-NEXT: .LBB41_3: ; SI-NEXT: ; implicit-def: $sgpr4 @@ -60547,10 +60824,12 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: s_branch .LBB41_2 ; SI-NEXT: .LBB41_4: +; SI-NEXT: v_mov_b32_e32 v4, s85 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_readlane_b32 s4, v62, 0 -; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v5, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 1 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 2 @@ -60558,281 +60837,314 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x 
float> inreg ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 3 -; SI-NEXT: v_mov_b32_e32 v2, s59 -; SI-NEXT: v_mov_b32_e32 v3, s58 -; SI-NEXT: v_mov_b32_e32 v61, s57 -; SI-NEXT: v_mov_b32_e32 v1, s56 -; SI-NEXT: v_mov_b32_e32 v59, s99 -; SI-NEXT: v_mov_b32_e32 v60, s98 -; SI-NEXT: v_mov_b32_e32 v57, s97 -; SI-NEXT: v_mov_b32_e32 v58, s96 -; SI-NEXT: v_mov_b32_e32 v47, s87 -; SI-NEXT: v_mov_b32_e32 v56, s86 -; SI-NEXT: v_mov_b32_e32 v45, s85 -; SI-NEXT: v_mov_b32_e32 v46, s84 -; SI-NEXT: v_mov_b32_e32 v43, s83 -; SI-NEXT: v_mov_b32_e32 v44, s82 -; SI-NEXT: v_mov_b32_e32 v41, s81 -; SI-NEXT: v_mov_b32_e32 v42, s80 -; SI-NEXT: v_mov_b32_e32 v55, s71 -; SI-NEXT: v_mov_b32_e32 v40, s70 -; SI-NEXT: v_mov_b32_e32 v53, s69 -; SI-NEXT: v_mov_b32_e32 v54, s68 -; SI-NEXT: v_mov_b32_e32 v51, s67 -; SI-NEXT: v_mov_b32_e32 v52, s66 -; SI-NEXT: v_mov_b32_e32 v49, s65 -; SI-NEXT: v_mov_b32_e32 v50, s64 -; SI-NEXT: v_mov_b32_e32 v39, s55 -; SI-NEXT: v_mov_b32_e32 v48, s54 -; SI-NEXT: v_mov_b32_e32 v37, s53 -; SI-NEXT: v_mov_b32_e32 v38, s52 -; SI-NEXT: v_mov_b32_e32 v35, s51 -; SI-NEXT: v_mov_b32_e32 v36, s50 -; SI-NEXT: v_mov_b32_e32 v33, s49 -; SI-NEXT: v_mov_b32_e32 v34, s48 -; SI-NEXT: v_mov_b32_e32 v31, s39 -; SI-NEXT: v_mov_b32_e32 v32, s38 -; SI-NEXT: v_mov_b32_e32 v29, s37 -; SI-NEXT: v_mov_b32_e32 v30, s36 -; SI-NEXT: v_mov_b32_e32 v27, s35 -; SI-NEXT: v_mov_b32_e32 v28, s34 -; SI-NEXT: v_mov_b32_e32 v25, s31 -; SI-NEXT: v_mov_b32_e32 v26, s30 -; SI-NEXT: v_mov_b32_e32 v23, s95 -; SI-NEXT: v_mov_b32_e32 v24, s94 -; SI-NEXT: v_mov_b32_e32 v21, s93 -; SI-NEXT: v_mov_b32_e32 v22, s92 -; SI-NEXT: v_mov_b32_e32 v19, s91 -; SI-NEXT: v_mov_b32_e32 v20, s90 -; SI-NEXT: v_mov_b32_e32 v17, s89 -; SI-NEXT: v_mov_b32_e32 v18, s88 -; SI-NEXT: v_mov_b32_e32 v15, s79 -; SI-NEXT: v_mov_b32_e32 v16, s78 -; SI-NEXT: v_mov_b32_e32 v13, s77 -; SI-NEXT: v_mov_b32_e32 v14, s76 -; SI-NEXT: v_mov_b32_e32 v11, s75 -; SI-NEXT: v_mov_b32_e32 v12, s74 
-; SI-NEXT: v_mov_b32_e32 v9, s73 -; SI-NEXT: v_mov_b32_e32 v10, s72 -; SI-NEXT: v_mov_b32_e32 v7, s63 -; SI-NEXT: v_mov_b32_e32 v8, s62 -; SI-NEXT: v_mov_b32_e32 v5, s61 -; SI-NEXT: v_mov_b32_e32 v6, s60 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v3, s59 +; SI-NEXT: v_mov_b32_e32 v2, s58 +; SI-NEXT: v_mov_b32_e32 v1, s57 +; SI-NEXT: v_mov_b32_e32 v61, s56 +; SI-NEXT: v_mov_b32_e32 v60, s99 +; SI-NEXT: v_mov_b32_e32 v59, s98 +; SI-NEXT: v_mov_b32_e32 v58, s97 +; SI-NEXT: v_mov_b32_e32 v57, s96 +; SI-NEXT: v_mov_b32_e32 v56, s87 +; SI-NEXT: v_mov_b32_e32 v47, s86 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: v_mov_b32_e32 v5, s60 +; SI-NEXT: v_mov_b32_e32 v6, s61 +; SI-NEXT: v_mov_b32_e32 v7, s62 +; SI-NEXT: v_mov_b32_e32 v8, s63 +; SI-NEXT: v_mov_b32_e32 v9, s72 +; SI-NEXT: v_mov_b32_e32 v10, s73 +; SI-NEXT: v_mov_b32_e32 v11, s74 +; SI-NEXT: v_mov_b32_e32 v12, s75 +; SI-NEXT: v_mov_b32_e32 v13, s76 +; SI-NEXT: v_mov_b32_e32 v14, s77 +; SI-NEXT: v_mov_b32_e32 v15, s78 +; SI-NEXT: v_mov_b32_e32 v16, s79 +; SI-NEXT: v_mov_b32_e32 v17, s88 +; SI-NEXT: v_mov_b32_e32 v18, s89 +; SI-NEXT: v_mov_b32_e32 v19, s90 +; SI-NEXT: v_mov_b32_e32 v20, s91 +; SI-NEXT: v_mov_b32_e32 v21, s92 +; SI-NEXT: v_mov_b32_e32 v22, s93 +; SI-NEXT: v_mov_b32_e32 v23, s94 +; SI-NEXT: v_mov_b32_e32 v24, s95 +; SI-NEXT: v_mov_b32_e32 v25, s30 +; SI-NEXT: v_mov_b32_e32 v26, s31 +; SI-NEXT: v_mov_b32_e32 v27, s34 +; SI-NEXT: v_mov_b32_e32 v28, s35 +; SI-NEXT: v_mov_b32_e32 v29, s36 +; SI-NEXT: v_mov_b32_e32 v30, s37 +; SI-NEXT: v_mov_b32_e32 v31, s38 +; SI-NEXT: v_mov_b32_e32 v32, s39 +; SI-NEXT: v_mov_b32_e32 v33, s48 +; SI-NEXT: v_mov_b32_e32 v34, s49 +; SI-NEXT: 
v_mov_b32_e32 v35, s50 +; SI-NEXT: v_mov_b32_e32 v36, s51 +; SI-NEXT: v_mov_b32_e32 v37, s52 +; SI-NEXT: v_mov_b32_e32 v38, s53 +; SI-NEXT: v_mov_b32_e32 v39, s54 +; SI-NEXT: v_mov_b32_e32 v48, s55 +; SI-NEXT: v_mov_b32_e32 v49, s64 +; SI-NEXT: v_mov_b32_e32 v50, s65 +; SI-NEXT: v_mov_b32_e32 v51, s66 +; SI-NEXT: v_mov_b32_e32 v52, s67 +; SI-NEXT: v_mov_b32_e32 v53, s68 +; SI-NEXT: v_mov_b32_e32 v54, s69 +; SI-NEXT: v_mov_b32_e32 v55, s70 +; SI-NEXT: v_mov_b32_e32 v40, s71 +; SI-NEXT: v_mov_b32_e32 v41, s80 +; SI-NEXT: v_mov_b32_e32 v42, s81 +; SI-NEXT: v_mov_b32_e32 v43, s82 +; SI-NEXT: v_mov_b32_e32 v44, s83 +; SI-NEXT: v_mov_b32_e32 v45, s84 ; SI-NEXT: .LBB41_5: ; %end ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v61 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v60 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v59 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v58 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 
v2, 1.0, v57 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v56 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v47 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v46 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v45 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v44 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v43 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v42 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v41 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 
v1, 1.0, v40 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v55 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v53 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v51 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v49 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v39 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt 
expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v37 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v35 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v33 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, 
s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v27 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v23 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v21 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v19 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 
0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v17 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v13 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v11 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v9 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 
v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v5 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_readlane_b32 s99, v63, 35 ; SI-NEXT: v_readlane_b32 s98, v63, 34 ; SI-NEXT: v_readlane_b32 s97, v63, 33 @@ -60869,22 +61181,23 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; SI-NEXT: v_readlane_b32 s34, v63, 2 ; SI-NEXT: v_readlane_b32 s31, v63, 1 ; SI-NEXT: v_readlane_b32 s30, v63, 0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, 
v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -60901,8 +61214,8 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -61153,213 +61466,224 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; 
SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40 +; SI-NEXT: 
buffer_load_dword v51, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword 
v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v20 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v22 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt 
expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v27 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v30 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v61, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v59, 
1.0, v4 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v57, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:96 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v7 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; 
SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v41 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v42 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v55 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v43 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v44 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v45 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v46 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v0 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:120 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v42 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v45 +; SI-NEXT: v_mul_f32_e32 
v37, 1.0, v46 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v47 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v6 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v0 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v3 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v4 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v5 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 
offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v61 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v57 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, 
v49 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v39 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr33 @@ -61385,132 +61709,173 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v62 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v56 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v37 ; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: v_alignbit_b32 v0, v0, v63, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v61, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v59, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v57, 16 -; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 -; SI-NEXT: v_alignbit_b32 v25, v25, v38, 16 -; SI-NEXT: v_alignbit_b32 v26, v26, v35, 16 -; SI-NEXT: v_alignbit_b32 v27, v27, v42, 16 -; SI-NEXT: v_alignbit_b32 v28, v28, v40, 16 -; SI-NEXT: v_alignbit_b32 v29, v29, v53, 16 -; SI-NEXT: v_alignbit_b32 v30, v30, v49, 16 -; SI-NEXT: v_alignbit_b32 v31, v31, v50, 16 -; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: 
; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword 
v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v16, v16, v17, 16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: s_waitcnt vmcnt(6) +; 
SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_alignbit_b32 v18, v18, v19, 16 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v19, 
16, v19 -; SI-NEXT: v_alignbit_b32 v19, v19, v20, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_alignbit_b32 v20, v20, v21, 16 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_alignbit_b32 v21, v21, v22, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_alignbit_b32 v22, v22, v23, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v52 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: v_alignbit_b32 v23, v23, v55, 16 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 
; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v4, v4, v47, 16 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_alignbit_b32 v15, v15, v32, 16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; 
SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v54 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v50 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v38 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v40 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v52 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v36 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 @@ -61556,263 +61921,315 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; 
implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: .LBB42_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB42_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; 
SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v47 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v39 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v48 +; SI-NEXT: 
v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; 
SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: 
buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; 
SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; 
SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: 
buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; 
SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v37 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, 
v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v55 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v51 +; 
SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v35 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v42 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v40 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v40 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v53 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v49 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v50 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v34 ; 
SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: .LBB42_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload @@ -61846,12 +62263,12 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB42_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 @@ -61863,14 +62280,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v15, v15, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 @@ -61881,14 
+62298,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v14, v14, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v13 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 @@ -61899,14 +62316,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v13, v13, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v12 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 @@ -61917,14 +62334,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; 
VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v12, v12, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v11 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 @@ -61935,14 +62352,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v11, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v11, v11, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v10 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 @@ -61953,14 +62370,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; 
VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v10, v10, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v9 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 @@ -61971,14 +62388,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v9, v9, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v8 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -61989,14 +62406,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, 
v8, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v8, v8, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v7 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 @@ -62007,14 +62424,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v7, v7, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v6 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 @@ -62025,14 +62442,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: 
v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v5 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 @@ -62043,14 +62460,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v5, v5, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v4 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -62061,14 +62478,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v4, v4, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 
0xffff0000, v3 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -62079,14 +62496,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v2 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v3, v3, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v2 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -62097,14 +62514,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v2, v2, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v1 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 
v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -62115,14 +62532,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v0 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -62133,15 +62550,15 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v32, 16 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v31, 
0xffff0000, v31 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 @@ -62152,14 +62569,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v30 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 @@ -62170,14 +62587,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_alignbit_b32 v30, v30, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v29 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v30, v30, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v29 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; VI-NEXT: v_or_b32_e32 v34, 
0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 @@ -62188,14 +62605,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v28 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 @@ -62206,14 +62623,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_alignbit_b32 v28, v28, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v28, v28, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v27 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v27, 
0x40c00000, v27 @@ -62224,14 +62641,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_alignbit_b32 v27, v27, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v26 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v26 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 @@ -62242,14 +62659,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_alignbit_b32 v26, v26, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v26, v26, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v25 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 @@ -62260,14 +62677,14 @@ define <32 x float> 
@bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_alignbit_b32 v25, v25, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v24 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v25, v25, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v24 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 @@ -62278,14 +62695,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_alignbit_b32 v24, v24, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v24, v24, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v23 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 @@ -62296,14 +62713,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 
0x400000, v23 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_alignbit_b32 v23, v23, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v23, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v22 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 @@ -62314,14 +62731,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_alignbit_b32 v22, v22, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v21 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v22, v22, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v21 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 @@ -62332,14 +62749,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; VI-NEXT: v_cndmask_b32_e32 v21, 
v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_alignbit_b32 v21, v21, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v20 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v21, v21, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v20 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 @@ -62350,14 +62767,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_alignbit_b32 v20, v20, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v20, v20, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v19 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 @@ -62368,14 +62785,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_alignbit_b32 v19, 
v19, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v18 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v19, v19, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v18 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 @@ -62386,14 +62803,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; VI-NEXT: v_alignbit_b32 v18, v18, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v18, v18, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v17 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 @@ -62404,14 +62821,14 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_alignbit_b32 v17, v17, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; VI-NEXT: v_and_b32_e32 v32, 
0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v17, v17, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v16 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 @@ -62422,8 +62839,8 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v16, v16, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB42_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -64055,6 +64472,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-LABEL: bitcast_v64bf16_to_v32f32_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v52, v28 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill @@ -64071,533 +64489,621 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword 
v62, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v11, 
1.0, s22 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v26 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v33 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v43 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45 +; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v58 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 -; SI-NEXT: v_mov_b32_e32 v38, v12 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v59 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v62 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte 
Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v30 -; SI-NEXT: v_mov_b32_e32 v37, v14 -; SI-NEXT: v_mov_b32_e32 v14, v11 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v28 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v42 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v43 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v44 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v45 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v46 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v47 -; SI-NEXT: 
v_mul_f32_e32 v50, 1.0, v56 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v57 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v58 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v59 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v60 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v61 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v36 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v42, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s22 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; 
SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword 
v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; 
%cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v5, v5, v8, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v35, 16 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v4, v4, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mov_b32_e32 v59, v2 -; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 -; SI-NEXT: v_mov_b32_e32 v57, v11 -; SI-NEXT: v_mov_b32_e32 v47, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_mov_b32_e32 v45, v12 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v3, v3, v33, 16 -; SI-NEXT: v_mov_b32_e32 v33, v14 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_mov_b32_e32 v62, v38 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_and_b32_e32 v5, 
0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v50 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v49 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v39 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v38 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v48 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v41 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v40 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v54 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v53 +; SI-NEXT: v_mov_b32_e32 v46, v38 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v29 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v51 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v31 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v18 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v21 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v22 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_mov_b32_e32 v44, v37 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_lshrrev_b32_e32 
v19, 16, v23 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v24 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v27 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v33, v22 +; SI-NEXT: v_mov_b32_e32 v22, v26 +; SI-NEXT: v_mov_b32_e32 v58, v28 +; SI-NEXT: v_mov_b32_e32 v43, v23 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v61, v29 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v52, v50 +; SI-NEXT: v_mov_b32_e32 v50, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v32 +; SI-NEXT: v_mov_b32_e32 v47, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v35 +; SI-NEXT: v_mov_b32_e32 v42, v49 +; SI-NEXT: v_mov_b32_e32 v45, v39 +; SI-NEXT: v_mov_b32_e32 v56, v48 +; SI-NEXT: v_mov_b32_e32 v57, v41 +; SI-NEXT: v_mov_b32_e32 v60, v40 +; SI-NEXT: v_mov_b32_e32 v59, v55 +; SI-NEXT: v_mov_b32_e32 v63, v54 +; SI-NEXT: v_mov_b32_e32 v36, v53 +; SI-NEXT: v_mov_b32_e32 v62, v51 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_mov_b32_e32 v61, v53 -; SI-NEXT: v_alignbit_b32 v15, v15, v53, 16 -; SI-NEXT: v_alignbit_b32 v17, v17, v39, 16 -; SI-NEXT: v_alignbit_b32 v18, v18, v41, 16 -; SI-NEXT: 
v_alignbit_b32 v19, v19, v40, 16 -; SI-NEXT: v_alignbit_b32 v20, v20, v55, 16 -; SI-NEXT: v_alignbit_b32 v21, v21, v54, 16 -; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 -; SI-NEXT: v_alignbit_b32 v23, v23, v52, 16 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_alignbit_b32 v25, v25, v50, 16 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_alignbit_b32 v26, v26, v49, 16 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_alignbit_b32 v27, v27, v48, 16 -; SI-NEXT: v_mov_b32_e32 v48, v37 -; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mov_b32_e32 v35, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v43, v8 -; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: 
v_and_b32_e32 v29, 0xffff0000, v29 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_alignbit_b32 v31, v31, v34, 16 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v58, v11 -; SI-NEXT: v_alignbit_b32 v9, v9, v11, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v56, v11 -; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v46, v12 -; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, v14 -; SI-NEXT: v_alignbit_b32 v12, v12, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 
4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v44, v14 -; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v36, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_alignbit_b32 v14, v14, v38, 16 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v53, v38 -; SI-NEXT: v_alignbit_b32 v16, v16, v38, 16 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v41 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v38 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v22, v54, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: s_cbranch_execnz .LBB43_3 ; SI-NEXT: .LBB43_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 
0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v57 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_and_b32_e32 v14, 
0xffff0000, v14 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: buffer_load_dword 
v24, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: 
v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: 
v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 
0x40c00000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 
-; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v58 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 
+; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v61 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v58 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v61 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v47 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v53 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v38 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v39 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v41 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v43 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v40 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: 
buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v55 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v51 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v50 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v49 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v48 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: 
s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v37 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: .LBB43_3: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -64615,41 +65121,28 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB43_4: -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword 
v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v61, v53 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v59, v2 -; SI-NEXT: v_mov_b32_e32 v57, v11 -; SI-NEXT: v_mov_b32_e32 v47, v10 -; SI-NEXT: v_mov_b32_e32 v45, v12 -; SI-NEXT: v_mov_b32_e32 v33, v14 -; SI-NEXT: v_mov_b32_e32 v62, v38 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v41 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v48, v37 -; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v52, v50 +; SI-NEXT: v_mov_b32_e32 v50, v30 +; SI-NEXT: v_mov_b32_e32 v42, v49 +; SI-NEXT: v_mov_b32_e32 v44, v37 +; SI-NEXT: v_mov_b32_e32 v45, v39 +; SI-NEXT: v_mov_b32_e32 v46, v38 +; SI-NEXT: v_mov_b32_e32 v56, v48 +; SI-NEXT: v_mov_b32_e32 v57, v41 +; SI-NEXT: v_mov_b32_e32 v60, v40 +; SI-NEXT: v_mov_b32_e32 v59, v55 +; SI-NEXT: v_mov_b32_e32 v63, v54 +; SI-NEXT: v_mov_b32_e32 v36, v53 +; SI-NEXT: v_mov_b32_e32 v61, v29 +; SI-NEXT: v_mov_b32_e32 v62, v51 +; SI-NEXT: v_mov_b32_e32 v58, v28 +; SI-NEXT: v_mov_b32_e32 v47, v31 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_mov_b32_e32 v33, v22 +; SI-NEXT: v_mov_b32_e32 v43, v23 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_branch .LBB43_2 ; @@ -64694,12 +65187,12 @@ define 
inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB43_3 ; VI-NEXT: .LBB43_2: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v15 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 @@ -64710,14 +65203,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v15, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v14 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 @@ -64728,14 +65221,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 +; 
VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v14, v14, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v13 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 @@ -64746,14 +65239,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v13, v13, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v12 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 @@ -64764,14 +65257,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa 
v12, v12, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v11 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 @@ -64782,14 +65275,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v11, v11, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v10 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 @@ -64800,14 +65293,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v10, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v9 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 @@ -64818,14 +65311,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v9, v9, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v8 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -64836,14 +65329,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v8, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v7 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 
; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 @@ -64854,14 +65347,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v7, v7, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 @@ -64872,14 +65365,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v6, v6, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v5 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 
v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 @@ -64890,14 +65383,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v5, v5, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v4 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -64908,14 +65401,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v3 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 
; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -64926,14 +65419,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v3, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v2 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -64944,14 +65437,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v1 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: 
v_add_f32_e32 v1, 0x40c00000, v1 @@ -64962,14 +65455,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v0 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -64980,14 +65473,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v31 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 @@ -64998,14 +65491,14 @@ define inreg <32 x float> 
@bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v31, v31, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v30 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 @@ -65016,14 +65509,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v30, v30, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v29 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 @@ -65034,14 +65527,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: 
v_or_b32_e32 v34, 0x400000, v29 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v29, v29, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v28 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 @@ -65052,14 +65545,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v28, v28, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v27 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 @@ -65070,14 +65563,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, 
v27 ; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v27, v27, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v26 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 @@ -65088,14 +65581,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v26, v26, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v25 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 @@ -65106,14 +65599,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; VI-NEXT: 
v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v25, v25, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v24 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 @@ -65124,14 +65617,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v24, v24, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v23 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 @@ -65142,14 +65635,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_alignbit_b32 v23, v23, 
v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v23, v23, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v22 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 @@ -65160,14 +65653,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v22, v22, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v21 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 @@ -65178,14 +65671,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 +; VI-NEXT: 
v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v21, v21, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v20 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 @@ -65196,14 +65689,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v20, v20, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v19 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 @@ -65214,14 +65707,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v19, v19, 
v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 @@ -65232,14 +65725,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v32, v32, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 @@ -65250,14 +65743,14 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 
+; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v16 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 @@ -65268,8 +65761,8 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v16, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB43_3: ; %end ; VI-NEXT: v_mov_b32_e32 v18, v32 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -94210,36 +94703,36 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: 
$vgpr54 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -94315,118 +94808,118 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 
v28, 0xffff0000, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: 
s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v62 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v62 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v63 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v63 -; SI-NEXT: 
buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v11 -; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; 
SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v7 -; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 -; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4 -; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 -; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v9 +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v8 +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v6 +; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v5 +; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v4 +; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v3 +; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v2 +; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -94499,220 +94992,235 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v32, vcc, 0, v62, vcc ; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, 
v31 ; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], 
s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v14 -; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v13 -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v12 -; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v11 -; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v9 -; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v8 -; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v7 -; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v6 -; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v5 -; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v4 -; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 -; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v2 -; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v1 -; 
SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v14 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v12 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v11 +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v10 +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v9 +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, 
v8 +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v6 +; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v5 +; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v4 +; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v3 +; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v2 +; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: .LBB60_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_mul_f32_e32 
v1, 1.0, v61 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v60 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v46 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v44 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 
1.0, v43 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v42 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v50 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt 
expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v36 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v32 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, 
s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -94720,10 +95228,11 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -94731,10 +95240,11 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -94742,10 +95252,11 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ 
-94753,10 +95264,11 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -94764,10 +95276,11 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -94775,10 +95288,11 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -94786,10 +95300,11 @@ define <64 x bfloat> 
@bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -94797,10 +95312,11 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -94808,10 +95324,11 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -94819,10 +95336,11 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) 
{ ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -94830,10 +95348,11 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -94841,10 +95360,11 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -94852,10 +95372,11 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], 
s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -94863,10 +95384,11 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -94874,19 +95396,21 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 
v2, 1.0, v31 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -94895,10 +95419,11 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload @@ -95172,66 +95697,66 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a ; SI-NEXT: v_writelane_b32 v21, s4, 2 ; SI-NEXT: s_lshl_b32 s4, s8, 16 ; SI-NEXT: v_writelane_b32 v21, s4, 3 -; SI-NEXT: s_and_b32 s11, s7, 0xffff0000 -; SI-NEXT: s_lshl_b32 s10, s7, 16 -; SI-NEXT: s_and_b32 s13, s6, 0xffff0000 -; SI-NEXT: s_lshl_b32 s12, s6, 16 -; SI-NEXT: s_and_b32 s15, s99, 0xffff0000 -; SI-NEXT: s_lshl_b32 s14, s99, 16 -; SI-NEXT: s_and_b32 s41, s98, 0xffff0000 -; SI-NEXT: s_lshl_b32 s40, s98, 16 -; SI-NEXT: s_and_b32 s43, s97, 0xffff0000 -; SI-NEXT: s_lshl_b32 s42, s97, 16 -; SI-NEXT: s_and_b32 s45, s96, 0xffff0000 -; SI-NEXT: s_lshl_b32 s44, s96, 16 -; SI-NEXT: s_and_b32 s47, s87, 0xffff0000 -; SI-NEXT: s_lshl_b32 s46, s87, 16 -; SI-NEXT: s_and_b32 s57, s86, 0xffff0000 -; 
SI-NEXT: s_lshl_b32 s56, s86, 16 -; SI-NEXT: s_and_b32 s59, s85, 0xffff0000 -; SI-NEXT: s_lshl_b32 s58, s85, 16 -; SI-NEXT: s_and_b32 s61, s84, 0xffff0000 -; SI-NEXT: s_lshl_b32 s60, s84, 16 -; SI-NEXT: s_and_b32 s63, s83, 0xffff0000 -; SI-NEXT: s_lshl_b32 s62, s83, 16 -; SI-NEXT: s_and_b32 s73, s82, 0xffff0000 -; SI-NEXT: s_lshl_b32 s72, s82, 16 -; SI-NEXT: s_and_b32 s75, s81, 0xffff0000 -; SI-NEXT: s_lshl_b32 s74, s81, 16 -; SI-NEXT: s_and_b32 s77, s80, 0xffff0000 -; SI-NEXT: s_lshl_b32 s76, s80, 16 -; SI-NEXT: s_and_b32 s79, s71, 0xffff0000 -; SI-NEXT: s_lshl_b32 s78, s71, 16 -; SI-NEXT: s_and_b32 s89, s70, 0xffff0000 -; SI-NEXT: s_lshl_b32 s88, s70, 16 -; SI-NEXT: s_and_b32 s91, s29, 0xffff0000 -; SI-NEXT: s_lshl_b32 s90, s29, 16 -; SI-NEXT: s_and_b32 s93, s28, 0xffff0000 -; SI-NEXT: s_lshl_b32 s92, s28, 16 -; SI-NEXT: s_and_b32 s95, s27, 0xffff0000 -; SI-NEXT: s_lshl_b32 s94, s27, 16 -; SI-NEXT: s_and_b32 s31, s26, 0xffff0000 -; SI-NEXT: s_lshl_b32 s30, s26, 16 -; SI-NEXT: s_and_b32 s35, s25, 0xffff0000 -; SI-NEXT: s_lshl_b32 s34, s25, 16 -; SI-NEXT: s_and_b32 s37, s24, 0xffff0000 -; SI-NEXT: s_lshl_b32 s36, s24, 16 -; SI-NEXT: s_and_b32 s39, s23, 0xffff0000 -; SI-NEXT: s_lshl_b32 s38, s23, 16 -; SI-NEXT: s_and_b32 s49, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s48, s22, 16 -; SI-NEXT: s_and_b32 s51, s21, 0xffff0000 -; SI-NEXT: s_lshl_b32 s50, s21, 16 -; SI-NEXT: s_and_b32 s53, s20, 0xffff0000 -; SI-NEXT: s_lshl_b32 s52, s20, 16 -; SI-NEXT: s_and_b32 s55, s19, 0xffff0000 -; SI-NEXT: s_lshl_b32 s54, s19, 16 -; SI-NEXT: s_and_b32 s65, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s64, s18, 16 -; SI-NEXT: s_and_b32 s67, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s66, s17, 16 -; SI-NEXT: s_and_b32 s69, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s68, s16, 16 +; SI-NEXT: s_and_b32 s10, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s7, 16 +; SI-NEXT: s_and_b32 s12, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s6, 16 +; SI-NEXT: s_and_b32 s14, s99, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, 
s99, 16 +; SI-NEXT: s_and_b32 s40, s98, 0xffff0000 +; SI-NEXT: s_lshl_b32 s41, s98, 16 +; SI-NEXT: s_and_b32 s42, s97, 0xffff0000 +; SI-NEXT: s_lshl_b32 s43, s97, 16 +; SI-NEXT: s_and_b32 s44, s96, 0xffff0000 +; SI-NEXT: s_lshl_b32 s45, s96, 16 +; SI-NEXT: s_and_b32 s46, s87, 0xffff0000 +; SI-NEXT: s_lshl_b32 s47, s87, 16 +; SI-NEXT: s_and_b32 s56, s86, 0xffff0000 +; SI-NEXT: s_lshl_b32 s57, s86, 16 +; SI-NEXT: s_and_b32 s58, s85, 0xffff0000 +; SI-NEXT: s_lshl_b32 s59, s85, 16 +; SI-NEXT: s_and_b32 s60, s84, 0xffff0000 +; SI-NEXT: s_lshl_b32 s61, s84, 16 +; SI-NEXT: s_and_b32 s62, s83, 0xffff0000 +; SI-NEXT: s_lshl_b32 s63, s83, 16 +; SI-NEXT: s_and_b32 s72, s82, 0xffff0000 +; SI-NEXT: s_lshl_b32 s73, s82, 16 +; SI-NEXT: s_and_b32 s74, s81, 0xffff0000 +; SI-NEXT: s_lshl_b32 s75, s81, 16 +; SI-NEXT: s_and_b32 s76, s80, 0xffff0000 +; SI-NEXT: s_lshl_b32 s77, s80, 16 +; SI-NEXT: s_and_b32 s78, s71, 0xffff0000 +; SI-NEXT: s_lshl_b32 s79, s71, 16 +; SI-NEXT: s_and_b32 s88, s70, 0xffff0000 +; SI-NEXT: s_lshl_b32 s89, s70, 16 +; SI-NEXT: s_and_b32 s90, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s91, s29, 16 +; SI-NEXT: s_and_b32 s92, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s93, s28, 16 +; SI-NEXT: s_and_b32 s94, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s95, s27, 16 +; SI-NEXT: s_and_b32 s30, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s31, s26, 16 +; SI-NEXT: s_and_b32 s34, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s35, s25, 16 +; SI-NEXT: s_and_b32 s36, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s37, s24, 16 +; SI-NEXT: s_and_b32 s38, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s39, s23, 16 +; SI-NEXT: s_and_b32 s48, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s49, s22, 16 +; SI-NEXT: s_and_b32 s50, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s51, s21, 16 +; SI-NEXT: s_and_b32 s52, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s53, s20, 16 +; SI-NEXT: s_and_b32 s54, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s55, s19, 16 +; SI-NEXT: s_and_b32 s64, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s65, s18, 16 +; SI-NEXT: 
s_and_b32 s66, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s67, s17, 16 +; SI-NEXT: s_and_b32 s68, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s69, s16, 16 ; SI-NEXT: s_cbranch_execnz .LBB61_3 ; SI-NEXT: .LBB61_2: ; %cmp.true ; SI-NEXT: s_add_u32 s4, s16, 3 @@ -95250,18 +95775,18 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a ; SI-NEXT: s_addc_u32 s27, s29, 0 ; SI-NEXT: s_add_u32 s28, s70, 3 ; SI-NEXT: s_addc_u32 s29, s71, 0 -; SI-NEXT: s_add_u32 s76, s80, 3 -; SI-NEXT: s_addc_u32 s74, s81, 0 -; SI-NEXT: s_add_u32 s72, s82, 3 -; SI-NEXT: s_addc_u32 s62, s83, 0 -; SI-NEXT: s_add_u32 s60, s84, 3 -; SI-NEXT: s_addc_u32 s58, s85, 0 -; SI-NEXT: s_add_u32 s56, s86, 3 -; SI-NEXT: s_addc_u32 s46, s87, 0 -; SI-NEXT: s_add_u32 s44, s96, 3 -; SI-NEXT: s_addc_u32 s42, s97, 0 -; SI-NEXT: s_add_u32 s40, s98, 3 -; SI-NEXT: s_addc_u32 s14, s99, 0 +; SI-NEXT: s_add_u32 s77, s80, 3 +; SI-NEXT: s_addc_u32 s75, s81, 0 +; SI-NEXT: s_add_u32 s73, s82, 3 +; SI-NEXT: s_addc_u32 s63, s83, 0 +; SI-NEXT: s_add_u32 s61, s84, 3 +; SI-NEXT: s_addc_u32 s59, s85, 0 +; SI-NEXT: s_add_u32 s57, s86, 3 +; SI-NEXT: s_addc_u32 s47, s87, 0 +; SI-NEXT: s_add_u32 s45, s96, 3 +; SI-NEXT: s_addc_u32 s43, s97, 0 +; SI-NEXT: s_add_u32 s41, s98, 3 +; SI-NEXT: s_addc_u32 s15, s99, 0 ; SI-NEXT: s_add_u32 s6, s6, 3 ; SI-NEXT: s_addc_u32 s7, s7, 0 ; SI-NEXT: s_add_u32 s8, s8, 3 @@ -95274,292 +95799,324 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a ; SI-NEXT: s_and_b32 s9, s8, 0xffff0000 ; SI-NEXT: v_writelane_b32 v21, s9, 2 ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_and_b32 s11, s7, 0xffff0000 -; SI-NEXT: s_lshl_b32 s10, s7, 16 -; SI-NEXT: s_and_b32 s13, s6, 0xffff0000 -; SI-NEXT: s_lshl_b32 s12, s6, 16 -; SI-NEXT: s_and_b32 s15, s14, 0xffff0000 -; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: s_and_b32 s41, s40, 0xffff0000 -; SI-NEXT: s_lshl_b32 s40, s40, 16 -; SI-NEXT: s_and_b32 s43, s42, 0xffff0000 -; SI-NEXT: s_lshl_b32 s42, s42, 16 -; 
SI-NEXT: s_and_b32 s45, s44, 0xffff0000 -; SI-NEXT: s_lshl_b32 s44, s44, 16 -; SI-NEXT: s_and_b32 s47, s46, 0xffff0000 -; SI-NEXT: s_lshl_b32 s46, s46, 16 -; SI-NEXT: s_and_b32 s57, s56, 0xffff0000 -; SI-NEXT: s_lshl_b32 s56, s56, 16 -; SI-NEXT: s_and_b32 s59, s58, 0xffff0000 -; SI-NEXT: s_lshl_b32 s58, s58, 16 -; SI-NEXT: s_and_b32 s61, s60, 0xffff0000 -; SI-NEXT: s_lshl_b32 s60, s60, 16 -; SI-NEXT: s_and_b32 s63, s62, 0xffff0000 -; SI-NEXT: s_lshl_b32 s62, s62, 16 -; SI-NEXT: s_and_b32 s73, s72, 0xffff0000 -; SI-NEXT: s_lshl_b32 s72, s72, 16 -; SI-NEXT: s_and_b32 s75, s74, 0xffff0000 -; SI-NEXT: s_lshl_b32 s74, s74, 16 -; SI-NEXT: s_and_b32 s77, s76, 0xffff0000 -; SI-NEXT: s_lshl_b32 s76, s76, 16 -; SI-NEXT: s_and_b32 s79, s29, 0xffff0000 -; SI-NEXT: s_lshl_b32 s78, s29, 16 -; SI-NEXT: s_and_b32 s89, s28, 0xffff0000 -; SI-NEXT: s_lshl_b32 s88, s28, 16 -; SI-NEXT: s_and_b32 s91, s27, 0xffff0000 -; SI-NEXT: s_lshl_b32 s90, s27, 16 -; SI-NEXT: s_and_b32 s93, s26, 0xffff0000 -; SI-NEXT: s_lshl_b32 s92, s26, 16 -; SI-NEXT: s_and_b32 s95, s25, 0xffff0000 -; SI-NEXT: s_lshl_b32 s94, s25, 16 -; SI-NEXT: s_and_b32 s31, s24, 0xffff0000 -; SI-NEXT: s_lshl_b32 s30, s24, 16 -; SI-NEXT: s_and_b32 s35, s23, 0xffff0000 -; SI-NEXT: s_lshl_b32 s34, s23, 16 -; SI-NEXT: s_and_b32 s37, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s36, s22, 16 -; SI-NEXT: s_and_b32 s39, s21, 0xffff0000 -; SI-NEXT: s_lshl_b32 s38, s21, 16 -; SI-NEXT: s_and_b32 s49, s20, 0xffff0000 -; SI-NEXT: s_lshl_b32 s48, s20, 16 -; SI-NEXT: s_and_b32 s51, s19, 0xffff0000 -; SI-NEXT: s_lshl_b32 s50, s19, 16 -; SI-NEXT: s_and_b32 s53, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s52, s18, 16 -; SI-NEXT: s_and_b32 s55, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s54, s17, 16 -; SI-NEXT: s_and_b32 s65, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s64, s16, 16 -; SI-NEXT: s_and_b32 s67, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s66, s5, 16 -; SI-NEXT: s_and_b32 s69, s4, 0xffff0000 -; SI-NEXT: s_lshl_b32 s68, s4, 16 +; SI-NEXT: s_and_b32 s10, s7, 
0xffff0000 +; SI-NEXT: s_lshl_b32 s11, s7, 16 +; SI-NEXT: s_and_b32 s12, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s13, s6, 16 +; SI-NEXT: s_and_b32 s14, s15, 0xffff0000 +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_and_b32 s40, s41, 0xffff0000 +; SI-NEXT: s_lshl_b32 s41, s41, 16 +; SI-NEXT: s_and_b32 s42, s43, 0xffff0000 +; SI-NEXT: s_lshl_b32 s43, s43, 16 +; SI-NEXT: s_and_b32 s44, s45, 0xffff0000 +; SI-NEXT: s_lshl_b32 s45, s45, 16 +; SI-NEXT: s_and_b32 s46, s47, 0xffff0000 +; SI-NEXT: s_lshl_b32 s47, s47, 16 +; SI-NEXT: s_and_b32 s56, s57, 0xffff0000 +; SI-NEXT: s_lshl_b32 s57, s57, 16 +; SI-NEXT: s_and_b32 s58, s59, 0xffff0000 +; SI-NEXT: s_lshl_b32 s59, s59, 16 +; SI-NEXT: s_and_b32 s60, s61, 0xffff0000 +; SI-NEXT: s_lshl_b32 s61, s61, 16 +; SI-NEXT: s_and_b32 s62, s63, 0xffff0000 +; SI-NEXT: s_lshl_b32 s63, s63, 16 +; SI-NEXT: s_and_b32 s72, s73, 0xffff0000 +; SI-NEXT: s_lshl_b32 s73, s73, 16 +; SI-NEXT: s_and_b32 s74, s75, 0xffff0000 +; SI-NEXT: s_lshl_b32 s75, s75, 16 +; SI-NEXT: s_and_b32 s76, s77, 0xffff0000 +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_and_b32 s78, s29, 0xffff0000 +; SI-NEXT: s_lshl_b32 s79, s29, 16 +; SI-NEXT: s_and_b32 s88, s28, 0xffff0000 +; SI-NEXT: s_lshl_b32 s89, s28, 16 +; SI-NEXT: s_and_b32 s90, s27, 0xffff0000 +; SI-NEXT: s_lshl_b32 s91, s27, 16 +; SI-NEXT: s_and_b32 s92, s26, 0xffff0000 +; SI-NEXT: s_lshl_b32 s93, s26, 16 +; SI-NEXT: s_and_b32 s94, s25, 0xffff0000 +; SI-NEXT: s_lshl_b32 s95, s25, 16 +; SI-NEXT: s_and_b32 s30, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s31, s24, 16 +; SI-NEXT: s_and_b32 s34, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s35, s23, 16 +; SI-NEXT: s_and_b32 s36, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s37, s22, 16 +; SI-NEXT: s_and_b32 s38, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s39, s21, 16 +; SI-NEXT: s_and_b32 s48, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s49, s20, 16 +; SI-NEXT: s_and_b32 s50, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s51, s19, 16 +; SI-NEXT: s_and_b32 s52, s18, 0xffff0000 +; SI-NEXT: 
s_lshl_b32 s53, s18, 16 +; SI-NEXT: s_and_b32 s54, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s55, s17, 16 +; SI-NEXT: s_and_b32 s64, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s65, s16, 16 +; SI-NEXT: s_and_b32 s66, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s67, s5, 16 +; SI-NEXT: s_and_b32 s68, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s69, s4, 16 ; SI-NEXT: v_writelane_b32 v21, s8, 3 ; SI-NEXT: .LBB61_3: ; %end ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s68 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s66 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s64 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s54 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
v_mul_f32_e64 v1, 1.0, s53 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s52 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s50 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s48 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s38 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s36 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; 
SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s34 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s30 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s94 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s92 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s90 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s88 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s78 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s72 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: 
v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s62 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s60 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s59 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s58 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s56 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s47 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s46 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 
+; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s44 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s43 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s42 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s41 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s40 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s15 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s14 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s12 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 
+; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s11 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s10 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: v_readlane_b32 s4, v21, 2 +; SI-NEXT: v_readlane_b32 s4, v21, 3 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_readlane_b32 s4, v21, 3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_readlane_b32 s4, v21, 2 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: v_readlane_b32 s4, v21, 0 +; SI-NEXT: v_readlane_b32 s4, v21, 1 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_readlane_b32 s4, v21, 1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_readlane_b32 s4, v21, 0 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: v_readlane_b32 s99, v20, 35 @@ -95607,66 +96164,66 @@ define inreg <64 x bfloat> @bitcast_v16i64_to_v64bf16_scalar(<16 x i64> inreg %a ; SI-NEXT: .LBB61_4: ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr68 ; SI-NEXT: ; 
implicit-def: $sgpr69 -; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr68 ; SI-NEXT: ; implicit-def: $sgpr67 -; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr66 ; SI-NEXT: ; implicit-def: $sgpr65 -; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr64 ; SI-NEXT: ; implicit-def: $sgpr55 -; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; implicit-def: $sgpr53 -; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr52 ; SI-NEXT: ; implicit-def: $sgpr51 -; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; implicit-def: $sgpr49 -; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: ; implicit-def: $sgpr39 -; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: ; implicit-def: $sgpr37 -; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr36 ; SI-NEXT: ; implicit-def: $sgpr35 -; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr34 ; SI-NEXT: ; implicit-def: $sgpr31 -; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr30 ; SI-NEXT: ; implicit-def: $sgpr95 -; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr94 ; SI-NEXT: ; implicit-def: $sgpr93 -; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr92 ; SI-NEXT: ; implicit-def: $sgpr91 -; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr90 ; SI-NEXT: ; implicit-def: $sgpr89 -; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr88 ; SI-NEXT: ; implicit-def: $sgpr79 -; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr77 -; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr76 ; SI-NEXT: ; implicit-def: $sgpr75 -; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $sgpr73 -; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; 
implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr47 -; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr46 ; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $sgpr43 -; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr41 -; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr13 -; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 @@ -95945,213 +96502,224 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; 
SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 +; SI-NEXT: 
buffer_load_dword v42, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v20 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v22 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt 
expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v27 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v30 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v61, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v59, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v57, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:96 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v7 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v32 -; SI-NEXT: buffer_store_dword 
v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded 
Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 -; SI-NEXT: 
buffer_load_dword v7, off, s[0:3], s32 offset:116 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v41 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v42 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v55 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v43 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v44 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v45 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v46 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v0 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:120 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v42 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v45 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v46 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v47 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v6 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v0 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: 
v_mul_f32_e32 v48, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v2 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v3 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v4 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v5 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB62_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 
offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v61 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v57 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v39 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 ; 
SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr33 @@ -96177,132 +96745,171 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v62 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v56 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v37 ; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: v_alignbit_b32 v0, v0, v63, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v61, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v59, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v57, 16 -; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 -; SI-NEXT: v_alignbit_b32 v25, v25, v38, 16 -; SI-NEXT: v_alignbit_b32 v26, v26, v35, 16 -; SI-NEXT: v_alignbit_b32 v27, v27, v42, 16 -; SI-NEXT: v_alignbit_b32 v28, v28, v40, 16 -; SI-NEXT: v_alignbit_b32 v29, v29, v53, 16 -; SI-NEXT: v_alignbit_b32 v30, v30, v49, 16 -; SI-NEXT: v_alignbit_b32 v31, v31, v50, 16 -; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; 
implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword 
v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v16, v16, v17, 16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 
+; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_alignbit_b32 v18, v18, v19, 16 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_alignbit_b32 v19, v19, v20, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 -; SI-NEXT: 
buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_alignbit_b32 v20, v20, v21, 16 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_alignbit_b32 v21, v21, v22, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_alignbit_b32 v22, v22, v23, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v52 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: v_alignbit_b32 v23, v23, v55, 16 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v13, 
16, v13 -; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v4, v4, v47, 16 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_alignbit_b32 v15, v15, v32, 16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v54 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v50 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v38 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v40 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v52 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v36 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 @@ -96348,263 +96955,317 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: .LBB62_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB62_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 
4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v47 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: 
v_and_b32_e32 v25, 0xffff0000, v39 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; SI-NEXT: 
buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; 
SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 
16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 
0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: 
buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v15, 
0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v37 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 
v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v55 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v51 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 
v25, 0x40c00000, v25 -; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v35 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v42 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v40 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v40 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v53 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v49 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v50 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: .LBB62_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload @@ -96638,12 +97299,12 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) 
{ ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB62_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 @@ -96655,14 +97316,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v15, v15, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 @@ -96673,14 +97334,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v14, v14, 
v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v13 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 @@ -96691,14 +97352,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v13, v13, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v12 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 @@ -96709,14 +97370,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v12, v12, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: 
v_and_b32_e32 v32, 0xffff0000, v11 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 @@ -96727,14 +97388,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v11, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v11, v11, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v10 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 @@ -96745,14 +97406,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v10, v10, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v9 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: 
v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 @@ -96763,14 +97424,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v9, v9, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v8 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -96781,14 +97442,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v8, v8, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v7 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: 
v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 @@ -96799,14 +97460,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v7, v7, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v6 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 @@ -96817,14 +97478,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v5 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 @@ -96835,14 +97496,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v5, v5, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v4 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -96853,14 +97514,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v4, v4, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v3 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -96871,14 +97532,14 @@ define <16 x i64> 
@bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v2 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v3, v3, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v2 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -96889,14 +97550,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v2, v2, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v1 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -96907,14 +97568,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, 
v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v0 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -96925,15 +97586,15 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v32, 16 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 @@ -96944,14 +97605,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 
v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v30 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 @@ -96962,14 +97623,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_alignbit_b32 v30, v30, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v29 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v30, v30, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v29 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 @@ -96980,14 +97641,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 
16, v28 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v28 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 @@ -96998,14 +97659,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_alignbit_b32 v28, v28, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v28, v28, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v27 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 @@ -97016,14 +97677,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_alignbit_b32 v27, v27, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v26 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v27, v27, 
v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v26 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 @@ -97034,14 +97695,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_alignbit_b32 v26, v26, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v26, v26, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v25 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 @@ -97052,14 +97713,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_alignbit_b32 v25, v25, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v24 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v25, v25, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: 
v_and_b32_e32 v32, 0xffff0000, v24 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 @@ -97070,14 +97731,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_alignbit_b32 v24, v24, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v24, v24, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v23 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 @@ -97088,14 +97749,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_alignbit_b32 v23, v23, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v23, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v22 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: 
v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 @@ -97106,14 +97767,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_alignbit_b32 v22, v22, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v21 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v22, v22, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v21 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 @@ -97124,14 +97785,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_alignbit_b32 v21, v21, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v20 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v21, v21, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v20 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: 
v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 @@ -97142,14 +97803,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_alignbit_b32 v20, v20, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v20, v20, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v19 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 @@ -97160,14 +97821,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_alignbit_b32 v19, v19, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v18 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v19, v19, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v18 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: 
v_lshlrev_b32_e32 v18, 16, v18 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 @@ -97178,14 +97839,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; VI-NEXT: v_alignbit_b32 v18, v18, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v18, v18, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v17 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 @@ -97196,14 +97857,14 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_alignbit_b32 v17, v17, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v17, v17, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v16 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 @@ -97214,8 +97875,8 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v16, v16, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB62_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -98847,6 +99508,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-LABEL: bitcast_v64bf16_to_v16i64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v52, v28 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill @@ -98863,533 +99525,621 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 -; 
SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded 
Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v19 +; 
SI-NEXT: v_mul_f32_e32 v19, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v26 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v33 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v43 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, 
v6 -; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v58 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 -; SI-NEXT: v_mov_b32_e32 v38, v12 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v59 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v62 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v30 -; SI-NEXT: v_mov_b32_e32 v37, v14 -; SI-NEXT: v_mov_b32_e32 v14, v11 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v9 -; SI-NEXT: 
v_mul_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v28 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v42 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v43 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v44 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v45 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v46 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v47 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v56 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v57 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v58 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v59 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v60 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v61 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34 -; 
SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v36 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v42, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s22 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte 
Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB63_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v5, v5, v8, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v35, 16 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, 
v4 -; SI-NEXT: v_alignbit_b32 v4, v4, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mov_b32_e32 v59, v2 -; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 -; SI-NEXT: v_mov_b32_e32 v57, v11 -; SI-NEXT: v_mov_b32_e32 v47, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_mov_b32_e32 v45, v12 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v3, v3, v33, 16 -; SI-NEXT: v_mov_b32_e32 v33, v14 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_mov_b32_e32 v62, v38 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v50 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v49 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 +; 
SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v39 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v38 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v48 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v41 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v40 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v54 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v53 +; SI-NEXT: v_mov_b32_e32 v46, v38 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v29 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v51 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v31 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v18 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v21 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v22 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_mov_b32_e32 v44, v37 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v23 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v24 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v27 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v33, v22 +; SI-NEXT: v_mov_b32_e32 v22, v26 +; SI-NEXT: v_mov_b32_e32 v58, v28 +; SI-NEXT: v_mov_b32_e32 v43, v23 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; 
SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v61, v29 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v52, v50 +; SI-NEXT: v_mov_b32_e32 v50, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v32 +; SI-NEXT: v_mov_b32_e32 v47, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v35 +; SI-NEXT: v_mov_b32_e32 v42, v49 +; SI-NEXT: v_mov_b32_e32 v45, v39 +; SI-NEXT: v_mov_b32_e32 v56, v48 +; SI-NEXT: v_mov_b32_e32 v57, v41 +; SI-NEXT: v_mov_b32_e32 v60, v40 +; SI-NEXT: v_mov_b32_e32 v59, v55 +; SI-NEXT: v_mov_b32_e32 v63, v54 +; SI-NEXT: v_mov_b32_e32 v36, v53 +; SI-NEXT: v_mov_b32_e32 v62, v51 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_mov_b32_e32 v61, v53 -; SI-NEXT: v_alignbit_b32 v15, v15, v53, 16 -; SI-NEXT: v_alignbit_b32 v17, v17, v39, 16 -; SI-NEXT: v_alignbit_b32 v18, v18, v41, 16 -; SI-NEXT: v_alignbit_b32 v19, v19, v40, 16 -; SI-NEXT: v_alignbit_b32 v20, v20, v55, 16 -; SI-NEXT: v_alignbit_b32 v21, v21, v54, 16 -; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v28, 
16, v28 -; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 -; SI-NEXT: v_alignbit_b32 v23, v23, v52, 16 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_alignbit_b32 v25, v25, v50, 16 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_alignbit_b32 v26, v26, v49, 16 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_alignbit_b32 v27, v27, v48, 16 -; SI-NEXT: v_mov_b32_e32 v48, v37 -; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mov_b32_e32 v35, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v43, v8 -; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_alignbit_b32 v31, v31, v34, 16 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, 
v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v58, v11 -; SI-NEXT: v_alignbit_b32 v9, v9, v11, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v56, v11 -; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v46, v12 -; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, v14 -; SI-NEXT: v_alignbit_b32 v12, v12, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v44, v14 -; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 
offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v36, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_alignbit_b32 v14, v14, v38, 16 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v53, v38 -; SI-NEXT: v_alignbit_b32 v16, v16, v38, 16 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v41 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v38 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v22, v54, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: s_cbranch_execnz .LBB63_3 ; SI-NEXT: .LBB63_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v57 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, 
v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, 
v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v15, 
0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 
v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; 
SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43 +; SI-NEXT: 
v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v58 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v61 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v58 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v61 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: 
v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v47 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v53 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v38 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v39 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v41 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v43 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v40 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v55 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v51 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v50 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: 
v_alignbit_b32 v26, v27, v26, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v49 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v48 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v37 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: 
v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: .LBB63_3: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -99407,41 +100157,28 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB63_4: -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v61, v53 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v59, v2 -; SI-NEXT: v_mov_b32_e32 v57, v11 
-; SI-NEXT: v_mov_b32_e32 v47, v10 -; SI-NEXT: v_mov_b32_e32 v45, v12 -; SI-NEXT: v_mov_b32_e32 v33, v14 -; SI-NEXT: v_mov_b32_e32 v62, v38 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v41 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v48, v37 -; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v52, v50 +; SI-NEXT: v_mov_b32_e32 v50, v30 +; SI-NEXT: v_mov_b32_e32 v42, v49 +; SI-NEXT: v_mov_b32_e32 v44, v37 +; SI-NEXT: v_mov_b32_e32 v45, v39 +; SI-NEXT: v_mov_b32_e32 v46, v38 +; SI-NEXT: v_mov_b32_e32 v56, v48 +; SI-NEXT: v_mov_b32_e32 v57, v41 +; SI-NEXT: v_mov_b32_e32 v60, v40 +; SI-NEXT: v_mov_b32_e32 v59, v55 +; SI-NEXT: v_mov_b32_e32 v63, v54 +; SI-NEXT: v_mov_b32_e32 v36, v53 +; SI-NEXT: v_mov_b32_e32 v61, v29 +; SI-NEXT: v_mov_b32_e32 v62, v51 +; SI-NEXT: v_mov_b32_e32 v58, v28 +; SI-NEXT: v_mov_b32_e32 v47, v31 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_mov_b32_e32 v33, v22 +; SI-NEXT: v_mov_b32_e32 v43, v23 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_branch .LBB63_2 ; @@ -99486,12 +100223,12 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB63_3 ; VI-NEXT: .LBB63_2: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v15 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: 
v_lshlrev_b32_e32 v15, 16, v15 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 @@ -99502,14 +100239,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v15, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v14 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 @@ -99520,14 +100257,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v14, v14, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v13 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; VI-NEXT: v_or_b32_e32 v34, 
0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 @@ -99538,14 +100275,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v13, v13, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v12 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 @@ -99556,14 +100293,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v12, v12, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v11 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: 
v_add_f32_e32 v11, 0x40c00000, v11 @@ -99574,14 +100311,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v11, v11, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v10 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 @@ -99592,14 +100329,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v10, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v9 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 @@ -99610,14 +100347,14 @@ define inreg 
<16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v9, v9, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v8 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -99628,14 +100365,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v8, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v7 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 @@ -99646,14 +100383,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 
0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v7, v7, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 @@ -99664,14 +100401,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v6, v6, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v5 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 @@ -99682,14 +100419,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc -; 
VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v5, v5, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v4 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -99700,14 +100437,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v3 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -99718,14 +100455,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16 -; VI-NEXT: 
v_lshlrev_b32_e32 v18, 16, v2 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v3, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v2 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -99736,14 +100473,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v1 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -99754,14 +100491,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa 
v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v0 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -99772,14 +100509,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v31 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 @@ -99790,14 +100527,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v31, v31, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; 
VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v30 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 @@ -99808,14 +100545,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v30, v30, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v29 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 @@ -99826,14 +100563,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v29, v29, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v28 ; VI-NEXT: 
v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 @@ -99844,14 +100581,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v28, v28, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v27 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 @@ -99862,14 +100599,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v27, v27, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v26 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 
16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 @@ -99880,14 +100617,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v26, v26, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v25 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 @@ -99898,14 +100635,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v25, v25, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v24 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: 
v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 @@ -99916,14 +100653,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v24, v24, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v23 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 @@ -99934,14 +100671,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_alignbit_b32 v23, v23, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v23, v23, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v22 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v22, 
0xffff0000, v22 +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 @@ -99952,14 +100689,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v22, v22, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v21 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 @@ -99970,14 +100707,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v21, v21, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v20 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; 
VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 @@ -99988,14 +100725,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v20, v20, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v19 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 @@ -100006,14 +100743,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v19, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 @@ -100024,14 +100761,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v32, v32, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 @@ -100042,14 +100779,14 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v16 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v16, 
0x40c00000, v16 @@ -100060,8 +100797,8 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v16, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB63_3: ; %end ; VI-NEXT: v_mov_b32_e32 v18, v32 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -129053,34 +129790,34 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; 
implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -129090,119 +129827,119 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB76_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: buffer_store_dword v21, off, 
s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 
4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v15 -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 -; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v13 -; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v12 -; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v11 -; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v10 -; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v9 -; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v8 -; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v7 -; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v6 -; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v5 -; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v4 -; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v3 -; 
SI-NEXT: v_lshlrev_b32_e32 v60, 16, v3 -; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v2 -; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v14 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v13 +; SI-NEXT: 
v_lshlrev_b32_e32 v49, 16, v13 +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v12 +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v11 +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v8 +; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v7 +; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v6 +; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 +; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v4 +; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v3 +; SI-NEXT: v_and_b32_e32 v62, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 @@ -129226,63 +129963,61 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v32 ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 
4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 ; SI-NEXT: 
v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 @@ -129292,170 +130027,186 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 ; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 ; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: 
s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 ; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v15 -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v14 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v14 -; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v13 -; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v12 -; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v11 -; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v10 -; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v9 -; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v8 -; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v7 -; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v6 -; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v5 -; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v4 -; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v3 -; SI-NEXT: 
v_lshlrev_b32_e32 v60, 16, v3 -; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v2 -; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v14 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v13 +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v12 +; SI-NEXT: 
v_lshlrev_b32_e32 v51, 16, v12 +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v11 +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 +; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v8 +; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v7 +; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v6 +; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 +; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v4 +; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v3 +; SI-NEXT: v_and_b32_e32 v62, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v1 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 
offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: .LBB76_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v33 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v63 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v62 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v61 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v60 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: 
v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v46 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v44 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v42 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: 
v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v50 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: 
v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v36 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -129463,10 +130214,11 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -129474,10 +130226,11 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -129485,10 +130238,11 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> 
%a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -129496,10 +130250,11 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -129507,10 +130262,11 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -129518,10 +130274,11 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -129529,10 +130286,11 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -129540,10 +130298,11 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -129551,10 +130310,11 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], 
s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -129562,10 +130322,11 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -129573,10 +130334,11 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -129584,10 +130346,11 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded 
Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -129595,10 +130358,11 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -129606,10 +130370,11 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -129617,10 +130382,11 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) 
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -129628,10 +130394,11 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload @@ -129641,7 +130408,8 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload @@ -129934,94 +130702,93 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_writelane_b32 v62, s46, 3 ; SI-NEXT: s_cbranch_execnz .LBB77_4 ; SI-NEXT: .LBB77_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[19:20], s[12:13], 1.0 +; SI-NEXT: v_add_f64 v[24:25], 
s[14:15], 1.0 ; SI-NEXT: v_add_f64 v[3:4], s[6:7], 1.0 ; SI-NEXT: v_add_f64 v[1:2], s[22:23], 1.0 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20 -; SI-NEXT: v_add_f64 v[41:42], s[24:25], 1.0 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v25 +; SI-NEXT: v_add_f64 v[40:41], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[52:53], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[48:49], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[36:37], s[44:45], 1.0 +; SI-NEXT: v_add_f64 v[32:33], s[42:43], 1.0 +; SI-NEXT: v_add_f64 v[28:29], s[40:41], 1.0 +; SI-NEXT: v_add_f64 v[20:21], s[12:13], 1.0 +; SI-NEXT: v_add_f64 v[16:17], s[10:11], 1.0 +; SI-NEXT: v_add_f64 v[12:13], s[8:9], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[4:5], 1.0 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v19 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v25 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v42 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v42 -; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v41 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v41 -; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v8 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v12 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v16 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v20 +; SI-NEXT: v_lshlrev_b32_e32 
v39, 16, v20 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v29 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v33 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v33 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v37 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v37 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v49 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v49 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v53 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v53 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v41 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v41 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v2 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f64 v[2:3], s[20:21], 1.0 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v3 +; SI-NEXT: v_add_f64 v[60:61], s[18:19], 1.0 ; SI-NEXT: v_add_f64 v[3:4], s[16:17], 1.0 ; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v1 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v1 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v61 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v61 ; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v4 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_mov_b32_e32 v4, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_add_f64 v[51:52], s[26:27], 1.0 -; SI-NEXT: v_add_f64 v[49:50], s[28:29], 1.0 -; SI-NEXT: v_add_f64 v[35:36], s[44:45], 1.0 -; SI-NEXT: v_add_f64 v[31:32], s[42:43], 1.0 -; SI-NEXT: v_add_f64 v[27:28], s[40:41], 1.0 -; SI-NEXT: v_add_f64 v[23:24], s[14:15], 1.0 -; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0 -; SI-NEXT: v_add_f64 v[11:12], 
s[8:9], 1.0 -; SI-NEXT: v_add_f64 v[7:8], s[4:5], 1.0 -; SI-NEXT: v_add_f64 v[59:60], s[18:19], 1.0 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v8 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v16 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v16 -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v20 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v24 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v28 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v32 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v36 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v36 -; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v35 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v50 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v50 -; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v49 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v49 -; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v52 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v52 -; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v51 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v13 +; SI-NEXT: 
v_lshlrev_b32_e32 v19, 16, v13 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v17 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v17 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v48 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v52 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v40 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 ; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v2 ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v2 -; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v60 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v60 -; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v59 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 +; SI-NEXT: v_and_b32_e32 v59, 0xffff0000, v60 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v60 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_branch .LBB77_5 @@ -130096,236 +130863,243 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: ; kill: killed $sgpr46 ; SI-NEXT: s_branch .LBB77_2 ; SI-NEXT: .LBB77_4: -; SI-NEXT: v_mov_b32_e32 v1, s71 +; SI-NEXT: v_mov_b32_e32 v1, s67 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s69 +; SI-NEXT: v_mov_b32_e32 v1, s66 ; SI-NEXT: v_readlane_b32 s4, v62, 0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s68 +; SI-NEXT: 
v_mov_b32_e32 v1, s65 ; SI-NEXT: v_mov_b32_e32 v61, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 2 ; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 3 -; SI-NEXT: v_mov_b32_e32 v5, s59 -; SI-NEXT: v_mov_b32_e32 v4, s58 -; SI-NEXT: v_mov_b32_e32 v9, s57 -; SI-NEXT: v_mov_b32_e32 v6, s56 -; SI-NEXT: v_mov_b32_e32 v13, s99 -; SI-NEXT: v_mov_b32_e32 v10, s98 -; SI-NEXT: v_mov_b32_e32 v17, s97 -; SI-NEXT: v_mov_b32_e32 v14, s96 -; SI-NEXT: v_mov_b32_e32 v21, s87 -; SI-NEXT: v_mov_b32_e32 v18, s86 -; SI-NEXT: v_mov_b32_e32 v25, s85 -; SI-NEXT: v_mov_b32_e32 v22, s84 -; SI-NEXT: v_mov_b32_e32 v29, s83 -; SI-NEXT: v_mov_b32_e32 v26, s82 -; SI-NEXT: v_mov_b32_e32 v33, s81 -; SI-NEXT: v_mov_b32_e32 v30, s80 -; SI-NEXT: v_mov_b32_e32 v34, s70 -; SI-NEXT: v_mov_b32_e32 v8, s67 -; SI-NEXT: v_mov_b32_e32 v7, s66 -; SI-NEXT: v_mov_b32_e32 v24, s65 -; SI-NEXT: v_mov_b32_e32 v23, s64 -; SI-NEXT: v_mov_b32_e32 v16, s55 -; SI-NEXT: v_mov_b32_e32 v15, s54 -; SI-NEXT: v_mov_b32_e32 v28, s53 -; SI-NEXT: v_mov_b32_e32 v27, s52 -; SI-NEXT: v_mov_b32_e32 v12, s51 -; SI-NEXT: v_mov_b32_e32 v11, s50 -; SI-NEXT: v_mov_b32_e32 v32, s49 -; SI-NEXT: v_mov_b32_e32 v31, s48 -; SI-NEXT: v_mov_b32_e32 v20, s39 -; SI-NEXT: v_mov_b32_e32 v19, s38 -; SI-NEXT: v_mov_b32_e32 v36, s37 -; SI-NEXT: v_mov_b32_e32 v35, s36 -; SI-NEXT: v_mov_b32_e32 v38, s35 -; SI-NEXT: v_mov_b32_e32 v37, s34 -; SI-NEXT: v_mov_b32_e32 v48, s31 -; SI-NEXT: v_mov_b32_e32 v39, s30 -; SI-NEXT: v_mov_b32_e32 v50, s95 -; SI-NEXT: v_mov_b32_e32 v49, s94 -; SI-NEXT: v_mov_b32_e32 v52, s93 -; SI-NEXT: v_mov_b32_e32 v51, s92 -; SI-NEXT: v_mov_b32_e32 v54, s91 -; SI-NEXT: v_mov_b32_e32 v53, s90 -; SI-NEXT: v_mov_b32_e32 v40, s89 -; SI-NEXT: v_mov_b32_e32 v55, s88 
-; SI-NEXT: v_mov_b32_e32 v42, s79 -; SI-NEXT: v_mov_b32_e32 v41, s78 +; SI-NEXT: v_mov_b32_e32 v4, s59 +; SI-NEXT: v_mov_b32_e32 v5, s58 +; SI-NEXT: v_mov_b32_e32 v6, s57 +; SI-NEXT: v_mov_b32_e32 v7, s56 +; SI-NEXT: v_mov_b32_e32 v10, s99 +; SI-NEXT: v_mov_b32_e32 v11, s98 +; SI-NEXT: v_mov_b32_e32 v14, s97 +; SI-NEXT: v_mov_b32_e32 v15, s96 +; SI-NEXT: v_mov_b32_e32 v18, s87 +; SI-NEXT: v_mov_b32_e32 v19, s86 +; SI-NEXT: v_mov_b32_e32 v22, s85 +; SI-NEXT: v_mov_b32_e32 v23, s84 +; SI-NEXT: v_mov_b32_e32 v26, s83 +; SI-NEXT: v_mov_b32_e32 v27, s82 +; SI-NEXT: v_mov_b32_e32 v30, s81 +; SI-NEXT: v_mov_b32_e32 v31, s80 +; SI-NEXT: v_mov_b32_e32 v34, s71 +; SI-NEXT: v_mov_b32_e32 v35, s70 +; SI-NEXT: v_mov_b32_e32 v38, s69 +; SI-NEXT: v_mov_b32_e32 v39, s68 +; SI-NEXT: v_mov_b32_e32 v24, s64 +; SI-NEXT: v_mov_b32_e32 v25, s55 +; SI-NEXT: v_mov_b32_e32 v16, s54 +; SI-NEXT: v_mov_b32_e32 v17, s53 +; SI-NEXT: v_mov_b32_e32 v28, s52 +; SI-NEXT: v_mov_b32_e32 v29, s51 +; SI-NEXT: v_mov_b32_e32 v12, s50 +; SI-NEXT: v_mov_b32_e32 v13, s49 +; SI-NEXT: v_mov_b32_e32 v32, s48 +; SI-NEXT: v_mov_b32_e32 v33, s39 +; SI-NEXT: v_mov_b32_e32 v20, s38 +; SI-NEXT: v_mov_b32_e32 v21, s37 +; SI-NEXT: v_mov_b32_e32 v36, s36 +; SI-NEXT: v_mov_b32_e32 v37, s35 +; SI-NEXT: v_mov_b32_e32 v8, s34 +; SI-NEXT: v_mov_b32_e32 v9, s31 +; SI-NEXT: v_mov_b32_e32 v48, s30 +; SI-NEXT: v_mov_b32_e32 v49, s95 +; SI-NEXT: v_mov_b32_e32 v50, s94 +; SI-NEXT: v_mov_b32_e32 v51, s93 +; SI-NEXT: v_mov_b32_e32 v52, s92 +; SI-NEXT: v_mov_b32_e32 v53, s91 +; SI-NEXT: v_mov_b32_e32 v54, s90 +; SI-NEXT: v_mov_b32_e32 v55, s89 +; SI-NEXT: v_mov_b32_e32 v40, s88 +; SI-NEXT: v_mov_b32_e32 v41, s79 +; SI-NEXT: v_mov_b32_e32 v42, s78 ; SI-NEXT: v_mov_b32_e32 v43, s77 ; SI-NEXT: v_mov_b32_e32 v44, s76 -; SI-NEXT: v_mov_b32_e32 v46, s75 -; SI-NEXT: v_mov_b32_e32 v45, s74 +; SI-NEXT: v_mov_b32_e32 v45, s75 +; SI-NEXT: v_mov_b32_e32 v46, s74 ; SI-NEXT: v_mov_b32_e32 v47, s73 ; SI-NEXT: v_mov_b32_e32 v56, s72 -; SI-NEXT: 
v_mov_b32_e32 v58, s63 -; SI-NEXT: v_mov_b32_e32 v57, s62 -; SI-NEXT: v_mov_b32_e32 v60, s61 -; SI-NEXT: v_mov_b32_e32 v59, s60 +; SI-NEXT: v_mov_b32_e32 v57, s63 +; SI-NEXT: v_mov_b32_e32 v58, s62 +; SI-NEXT: v_mov_b32_e32 v59, s61 +; SI-NEXT: v_mov_b32_e32 v60, s60 ; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: .LBB77_5: ; %end -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v61 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v60 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v59 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v58 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v57 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_mul_f32_e32 v1, 1.0, v47 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v47 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v46 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v45 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v43 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v44 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v42 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v41 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v40 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v55 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 
v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v53 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v51 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v49 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v9 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v39 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v37 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 
0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v21 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v35 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v33 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v19 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v13 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v11 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v17 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v27 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; 
SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v23 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_readlane_b32 s99, v63, 35 ; SI-NEXT: v_readlane_b32 s98, v63, 34 ; SI-NEXT: v_readlane_b32 s97, v63, 33 @@ -130362,77 +131136,102 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_readlane_b32 s34, v63, 2 ; SI-NEXT: v_readlane_b32 s31, v63, 1 ; SI-NEXT: v_readlane_b32 s30, v63, 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v30 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v29 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v27 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v26 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v18 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v14 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 -; SI-NEXT: v_alignbit_b32 
v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload @@ -130672,213 +131471,224 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v49, off, 
s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 
offset:76 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v14 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_mul_f32_e32 v0, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v19 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v20 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v21 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v22 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v26 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v27 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v30 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v61, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v59, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v57, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v8 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:96 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v7 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; 
SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v49 -; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v41 -; SI-NEXT: 
v_mul_f32_e32 v51, 1.0, v42 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v55 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v43 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v44 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v45 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v46 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v0 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:120 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v42 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v45 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v46 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v47 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v6 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v0 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v48, 1.0, 
v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v3 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v4 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v5 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB78_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte 
Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v61 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v57 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v39 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr33 @@ 
-130904,132 +131714,173 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v62 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v56 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v37 ; SI-NEXT: ; kill: killed $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: v_alignbit_b32 v0, v0, v63, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v61, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v59, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v57, 16 -; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 -; SI-NEXT: v_alignbit_b32 v25, v25, v38, 16 -; SI-NEXT: v_alignbit_b32 v26, v26, v35, 16 -; SI-NEXT: v_alignbit_b32 v27, v27, v42, 16 -; SI-NEXT: v_alignbit_b32 v28, v28, v40, 16 -; SI-NEXT: v_alignbit_b32 v29, v29, v53, 16 -; SI-NEXT: v_alignbit_b32 v30, v30, v49, 16 -; SI-NEXT: v_alignbit_b32 v31, v31, v50, 16 -; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; kill: killed $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr41 -; 
SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, 
s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v5, v5, v6, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v16, v16, v17, 16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 
v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_alignbit_b32 v18, v18, v19, 16 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_alignbit_b32 v19, v19, v20, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_alignbit_b32 v9, v9, v10, 16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; 
SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_alignbit_b32 v20, v20, v21, 16 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_alignbit_b32 v21, v21, v22, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_alignbit_b32 v22, v22, v23, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v52 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: v_alignbit_b32 v23, v23, v55, 16 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_alignbit_b32 v12, v12, v13, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 
offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_alignbit_b32 v14, v14, v15, 16 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v4, v4, v47, 16 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_alignbit_b32 v15, v15, v32, 16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, 
s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v54 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v50 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v38 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v40 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v52 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v48 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v36 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 @@ -131075,263 +131926,315 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; kill: killed $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: .LBB78_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB78_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 -; SI-NEXT: v_and_b32_e32 v3, 
0xffff0000, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v47 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v39 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: 
v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; 
SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; 
SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 
v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], 
s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v37 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 
offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v55 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v51 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_and_b32_e32 v26, 
0xffff0000, v35 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v42 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v40 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v40 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v53 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v49 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v50 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: .LBB78_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload @@ -131365,12 +132268,12 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB78_2 ; 
VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 @@ -131382,14 +132285,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v15, v15, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 @@ -131400,14 +132303,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v14, v14, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; 
VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v13 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 @@ -131418,14 +132321,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v13, v13, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v12 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 @@ -131436,14 +132339,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v12, v12, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v11 ; VI-NEXT: v_add_f32_e32 v32, 
0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 @@ -131454,14 +132357,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v11, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v11, v11, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v10 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 @@ -131472,14 +132375,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v10, v10, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v9 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, 
vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 @@ -131490,14 +132393,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v9, v9, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v8 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -131508,14 +132411,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v8, v8, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v7 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: 
v_lshlrev_b32_e32 v7, 16, v7 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 @@ -131526,14 +132429,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v7, v7, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v6 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 @@ -131544,14 +132447,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v5 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; 
VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 @@ -131562,14 +132465,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v5, v5, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v4 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -131580,14 +132483,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v4, v4, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v3 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -131598,14 +132501,14 @@ define <16 x double> 
@bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v2 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v3, v3, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v2 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -131616,14 +132519,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v2, v2, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v1 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -131634,14 +132537,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 ; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v1, v1, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v0 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -131652,15 +132555,15 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v32, 16 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v0, v0, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 @@ -131671,14 +132574,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc -; 
VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v30 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 @@ -131689,14 +132592,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_alignbit_b32 v30, v30, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v29 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v30, v30, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v29 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 @@ -131707,14 +132610,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_alignbit_b32 v29, v29, v32, 
16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v29, v29, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v28 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 @@ -131725,14 +132628,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_alignbit_b32 v28, v28, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v28, v28, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v27 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 @@ -131743,14 +132646,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_alignbit_b32 v27, v27, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v26 +; VI-NEXT: v_and_b32_e32 v32, 
0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v27, v27, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v26 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 @@ -131761,14 +132664,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_alignbit_b32 v26, v26, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v26, v26, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v25 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 @@ -131779,14 +132682,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_alignbit_b32 v25, v25, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v24 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v25, v25, v32 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v24 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 @@ -131797,14 +132700,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_alignbit_b32 v24, v24, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v24, v24, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v23 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 @@ -131815,14 +132718,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_alignbit_b32 v23, v23, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v23, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 
0xffff0000, v22 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 @@ -131833,14 +132736,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_alignbit_b32 v22, v22, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v21 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v22, v22, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v21 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 @@ -131851,14 +132754,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_alignbit_b32 v21, v21, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v20 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v21, v21, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v20 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 
v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 @@ -131869,14 +132772,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_alignbit_b32 v20, v20, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v20, v20, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v19 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 @@ -131887,14 +132790,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_alignbit_b32 v19, v19, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v18 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v19, v19, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v18 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: 
v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 @@ -131905,14 +132808,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; VI-NEXT: v_alignbit_b32 v18, v18, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v18, v18, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v17 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 @@ -131923,14 +132826,14 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_alignbit_b32 v17, v17, v32, 16 -; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v17, v17, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v16 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 ; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 -; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; 
VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 @@ -131941,8 +132844,8 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v16, v16, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB78_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -133574,6 +134477,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-LABEL: bitcast_v64bf16_to_v16f64_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v52, v28 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill @@ -133590,533 +134494,621 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 -; SI-NEXT: 
buffer_load_dword v43, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword 
v36, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v13, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: 
v_mul_f32_e32 v31, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v26 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v33 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v43 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v47 ; SI-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v58 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 -; SI-NEXT: v_mov_b32_e32 v38, v12 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v59 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v62 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v30 -; SI-NEXT: v_mov_b32_e32 v37, v14 -; SI-NEXT: v_mov_b32_e32 v14, v11 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: 
v_mul_f32_e32 v11, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v28 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v42 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v43 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v44 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v45 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v46 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v47 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v56 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v57 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v58 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v59 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v60 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v61 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, 
v33 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v36 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v42, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s22 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, 
s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 
4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB79_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v5, v5, v8, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) -; 
SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v35, 16 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v4, v4, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mov_b32_e32 v59, v2 -; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 -; SI-NEXT: v_mov_b32_e32 v57, v11 -; SI-NEXT: v_mov_b32_e32 v47, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_mov_b32_e32 v45, v12 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v3, v3, v33, 16 -; SI-NEXT: v_mov_b32_e32 v33, v14 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_mov_b32_e32 v62, v38 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v10 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v9 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v50 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30 +; 
SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v49 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v39 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v38 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v48 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v41 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v40 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v55 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v54 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v53 +; SI-NEXT: v_mov_b32_e32 v46, v38 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v29 +; SI-NEXT: v_mov_b32_e32 v38, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v51 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v28 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v31 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v18 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v20 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v21 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v22 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_mov_b32_e32 v44, v37 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v23 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v24 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v25 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v27 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v33, v22 +; SI-NEXT: v_mov_b32_e32 v22, v26 
+; SI-NEXT: v_mov_b32_e32 v58, v28 +; SI-NEXT: v_mov_b32_e32 v43, v23 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v61, v29 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v52, v50 +; SI-NEXT: v_mov_b32_e32 v50, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v32 +; SI-NEXT: v_mov_b32_e32 v47, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v35 +; SI-NEXT: v_mov_b32_e32 v42, v49 +; SI-NEXT: v_mov_b32_e32 v45, v39 +; SI-NEXT: v_mov_b32_e32 v56, v48 +; SI-NEXT: v_mov_b32_e32 v57, v41 +; SI-NEXT: v_mov_b32_e32 v60, v40 +; SI-NEXT: v_mov_b32_e32 v59, v55 +; SI-NEXT: v_mov_b32_e32 v63, v54 +; SI-NEXT: v_mov_b32_e32 v36, v53 +; SI-NEXT: v_mov_b32_e32 v62, v51 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_mov_b32_e32 v61, v53 -; SI-NEXT: v_alignbit_b32 v15, v15, v53, 16 -; SI-NEXT: v_alignbit_b32 v17, v17, v39, 16 -; SI-NEXT: v_alignbit_b32 v18, v18, v41, 16 -; SI-NEXT: v_alignbit_b32 v19, v19, v40, 16 -; SI-NEXT: v_alignbit_b32 v20, v20, v55, 16 -; SI-NEXT: v_alignbit_b32 v21, v21, v54, 16 -; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; 
SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 -; SI-NEXT: v_alignbit_b32 v23, v23, v52, 16 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_alignbit_b32 v25, v25, v50, 16 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_alignbit_b32 v26, v26, v49, 16 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_alignbit_b32 v27, v27, v48, 16 -; SI-NEXT: v_mov_b32_e32 v48, v37 -; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mov_b32_e32 v35, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v43, v8 -; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_alignbit_b32 v31, v31, v34, 16 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 -; 
SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v58, v11 -; SI-NEXT: v_alignbit_b32 v9, v9, v11, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v56, v11 -; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v46, v12 -; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, v14 -; SI-NEXT: v_alignbit_b32 v12, v12, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v44, v14 -; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: 
v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v36, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_alignbit_b32 v14, v14, v38, 16 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v53, v38 -; SI-NEXT: v_alignbit_b32 v16, v16, v38, 16 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v41 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v38 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v22, v54, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: s_cbranch_execnz .LBB79_3 ; SI-NEXT: .LBB79_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte 
Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v57 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: 
v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], 
s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; 
SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 
0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 
v25, 0x40c00000, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_and_b32_e32 v30, 
0xffff0000, v30 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 
0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v58 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v61 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v58 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v14, 
v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v61 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v47 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v53 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v38 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v39 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v41 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v43 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v40 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v55 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: 
buffer_load_dword v21, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v51 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v50 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v49 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v48 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v28, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v37 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: .LBB79_3: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -134134,41 +135126,28 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB79_4: -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v61, v53 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword 
v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v59, v2 -; SI-NEXT: v_mov_b32_e32 v57, v11 -; SI-NEXT: v_mov_b32_e32 v47, v10 -; SI-NEXT: v_mov_b32_e32 v45, v12 -; SI-NEXT: v_mov_b32_e32 v33, v14 -; SI-NEXT: v_mov_b32_e32 v62, v38 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v41 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v48, v37 -; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_mov_b32_e32 v52, v50 +; SI-NEXT: v_mov_b32_e32 v50, v30 +; SI-NEXT: v_mov_b32_e32 v42, v49 +; SI-NEXT: v_mov_b32_e32 v44, v37 +; SI-NEXT: v_mov_b32_e32 v45, v39 +; SI-NEXT: v_mov_b32_e32 v46, v38 +; SI-NEXT: v_mov_b32_e32 v56, v48 +; SI-NEXT: v_mov_b32_e32 v57, v41 +; SI-NEXT: v_mov_b32_e32 v60, v40 +; SI-NEXT: v_mov_b32_e32 v59, v55 +; SI-NEXT: v_mov_b32_e32 v63, v54 +; SI-NEXT: v_mov_b32_e32 v36, v53 +; SI-NEXT: v_mov_b32_e32 v61, v29 +; SI-NEXT: v_mov_b32_e32 v62, v51 +; SI-NEXT: v_mov_b32_e32 v58, v28 +; SI-NEXT: v_mov_b32_e32 v47, v31 +; SI-NEXT: v_mov_b32_e32 v37, v20 +; SI-NEXT: v_mov_b32_e32 v33, v22 +; SI-NEXT: v_mov_b32_e32 v43, v23 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_branch .LBB79_2 ; @@ -134213,12 +135192,12 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB79_3 ; VI-NEXT: .LBB79_2: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v15 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 
v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 @@ -134229,14 +135208,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v14 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v15, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v14 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 @@ -134247,14 +135226,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v13 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v14, v14, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v13 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 
-; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 @@ -134265,14 +135244,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v13, v13, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v12 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 @@ -134283,14 +135262,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v12, v12, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v11 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: 
v_lshlrev_b32_e32 v11, 16, v11 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 @@ -134301,14 +135280,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v11, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v11, v11, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v10 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 @@ -134319,14 +135298,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v10, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v9 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, 
v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 @@ -134337,14 +135316,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v9, v9, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v8 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -134355,14 +135334,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v8, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v7 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 @@ 
-134373,14 +135352,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v6 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v7, v7, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 @@ -134391,14 +135370,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v5 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v6, v6, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v5 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 @@ -134409,14 +135388,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> 
inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v5, v5, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v4 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -134427,14 +135406,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v3 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -134445,14 +135424,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: 
v_cndmask_b32_e32 v3, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v3, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v2 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -134463,14 +135442,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v1 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -134481,14 +135460,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: 
v_alignbit_b32 v1, v1, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v0 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v0 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -134499,14 +135478,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v31 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 @@ -134517,14 +135496,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30 +; VI-NEXT: 
v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v31, v31, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v30 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 @@ -134535,14 +135514,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_alignbit_b32 v30, v30, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v29 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v30, v30, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v29 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 @@ -134553,14 +135532,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 ; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_alignbit_b32 v29, v29, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v28 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v29, 
v29, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v28 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 @@ -134571,14 +135550,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 ; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_alignbit_b32 v28, v28, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v28, v28, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v27 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 @@ -134589,14 +135568,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 ; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_alignbit_b32 v27, v27, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v26 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v27, v27, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v26 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 @@ -134607,14 +135586,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 ; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_alignbit_b32 v26, v26, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v26, v26, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v25 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 @@ -134625,14 +135604,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 ; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_alignbit_b32 v25, v25, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v24 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v25, v25, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v24 ; 
VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 @@ -134643,14 +135622,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 ; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_alignbit_b32 v24, v24, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v23 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v24, v24, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v23 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 @@ -134661,14 +135640,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 ; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_alignbit_b32 v23, v23, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v23, v23, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v22 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 
v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 @@ -134679,14 +135658,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 ; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_alignbit_b32 v22, v22, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v22, v22, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v21 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 @@ -134697,14 +135676,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 ; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_alignbit_b32 v21, v21, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v20 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v21, v21, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v20 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; 
VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 @@ -134715,14 +135694,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 ; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_alignbit_b32 v20, v20, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v20, v20, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v19 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 @@ -134733,14 +135712,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 ; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_alignbit_b32 v19, v19, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v19, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v32 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: 
v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 @@ -134751,14 +135730,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_alignbit_b32 v32, v32, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v32, v32, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 @@ -134769,14 +135748,14 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_alignbit_b32 v17, v17, v18, 16 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v16 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 ; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; VI-NEXT: v_lshlrev_b32_e32 
v16, 16, v16 ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 @@ -134787,8 +135766,8 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v16, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB79_3: ; %end ; VI-NEXT: v_mov_b32_e32 v18, v32 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -143190,19 +144169,16 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 @@ -143251,14 +144227,11 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:724 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; SI-NEXT: s_waitcnt vmcnt(7) @@ -143293,14 +144266,15 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v26 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v29 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -143317,78 +144291,75 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:808 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:224 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v2 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v1 +; SI-NEXT: 
s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v2 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 -; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:256 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v1 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v2 +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v2 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:288 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:304 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:300 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:296 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v1 ; SI-NEXT: s_waitcnt vmcnt(5) @@ -143398,37 +144369,36 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:320 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:336 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:332 ; SI-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:328 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v1 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v2 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v3 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:352 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:368 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:360 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v3 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:384 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v2 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v2 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -143436,429 +144406,58 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:184 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:248 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:280 ; 
SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:312 ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:344 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:376 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 
-; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: 
$vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; SI-NEXT: s_cbranch_execz .LBB88_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v56 -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v54 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v60 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; 
SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v12, v5, v9 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:504 
; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v16, v5, v9 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; 
SI-NEXT: v_or_b32_e32 v20, v5, v9 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v22, v5, v9 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; 
SI-NEXT: v_lshlrev_b32_e32 v48, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v24, v5, v9 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, 
v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:376 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v28, v5, v9 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(1) -; 
SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:48 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v29, v5, v9 -; SI-NEXT: 
buffer_load_dword v5, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: 
s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -143936,730 +144535,1094 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB88_2 +; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v56 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; 
implicit-def: $vgpr59 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v27 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v27, v5, v9 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v28, 
16, v27 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v6, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v6, v5, v6 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v6 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; SI-NEXT: ; 
implicit-def: $vgpr4 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v30 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v30, v5, v9 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, 
s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v9, 0xff, v40 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v10, v10, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v57, v9 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: v_or_b32_e32 v21, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, 
v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v19, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v53 -; SI-NEXT: v_or_b32_e32 v19, v5, v26 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v19 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v31, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v15, 0xff, v58 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v23, v17, v15 -; SI-NEXT: v_and_b32_e32 v15, 0xff, v46 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 
v18, v3, v15 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v63 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; 
SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v59 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: v_or_b32_e32 v17, v4, v3 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: 
; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v33, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v42 -; SI-NEXT: v_or_b32_e32 v7, v5, v38 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v51, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v8, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v11, v41, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v47 -; SI-NEXT: v_or_b32_e32 v8, v5, v44 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v36 -; SI-NEXT: v_or_b32_e32 v14, v13, v61 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v43 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v5, v45, v5 -; SI-NEXT: v_or_b32_e32 v13, v62, v13 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: 
$vgpr25 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: .LBB88_2: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB88_4 -; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v63 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_or_b32_e32 v2, v2, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v59 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v58 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v56 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v46 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v60 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v43 -; SI-NEXT: v_or_b32_e32 v4, v61, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; SI-NEXT: v_or_b32_e32 v5, v62, v5 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_add_i32_e32 v5, vcc, 
3, v40 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v54 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; SI-NEXT: v_or_b32_e32 v9, v57, v9 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v5, v9, v5 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v47 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v36 -; SI-NEXT: v_or_b32_e32 v9, v44, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; SI-NEXT: v_or_b32_e32 v10, v45, v10 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v14 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; SI-NEXT: v_or_b32_e32 v8, v8, v10 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v25 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; SI-NEXT: v_or_b32_e32 v10, v41, v10 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v42 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v21 -; SI-NEXT: v_or_b32_e32 v10, v38, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; SI-NEXT: v_or_b32_e32 v11, v51, v11 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v15 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: v_or_b32_e32 v7, v7, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload 
-; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_mov_b32 s7, 0x3000000 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v33, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v53 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_or_b32_e32 v7, v26, v7 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_or_b32_e32 v12, v31, v12 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 
v12, v12, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v13, v7 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v19, v13 -; SI-NEXT: v_or_b32_e32 v13, v13, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v30 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_or_b32_e32 v7, v14, v7 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_or_b32_e32 v14, v14, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v22, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 
v48, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; SI-NEXT: v_or_b32_e32 v7, v15, v7 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: v_or_b32_e32 v15, v15, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v16, v7 -; SI-NEXT: v_or_b32_e32 v16, v7, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; 
SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v17, v7 -; SI-NEXT: v_or_b32_e32 v17, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v27 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v26, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, 
s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v18, v7 -; SI-NEXT: v_or_b32_e32 v18, v7, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:944 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v19, v7 -; SI-NEXT: v_or_b32_e32 v19, v7, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v29, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v1, v6, v1 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_or_b32_e32 v20, v6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:952 
; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-NEXT: v_or_b32_e32 v1, v6, v1 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_or_b32_e32 v21, v6, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v1, v6, v1 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 
v6, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_or_b32_e32 v22, v6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-NEXT: v_or_b32_e32 v1, v6, v1 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_or_b32_e32 v23, v6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v30, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: 
v_lshlrev_b32_e32 v2, 24, v11 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v1, v6, v1 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_or_b32_e32 v24, v6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v5, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-NEXT: v_or_b32_e32 v1, v6, v1 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_or_b32_e32 v25, v6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v10 -; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v16 -; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v25 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; 
SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v25 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v26, v6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; SI-NEXT: v_or_b32_e32 v11, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v27, v6, v1 -; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v12 -; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v18 -; SI-NEXT: v_add_i32_e32 v18, vcc, s7, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, s7, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 
offset:588 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v23, v27, v23 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v23 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v24, vcc, s7, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: 
v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v13, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v13 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword 
v1, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v36 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v16, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v23, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 +; SI-NEXT: v_or_b32_e32 v8, v1, v8 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 +; SI-NEXT: v_or_b32_e32 v9, v1, v39 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v51, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v20 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 +; SI-NEXT: v_or_b32_e32 v12, v1, v44 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v45, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v54 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:956 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v60 +; SI-NEXT: v_or_b32_e32 v14, v1, v61 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v62, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v10, v1 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v15, v63, v10 +; SI-NEXT: v_or_b32_e32 v10, v3, v7 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; 
kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: 
; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; 
SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v14 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: .LBB88_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB88_4 +; SI-NEXT: ; %bb.3: ; 
%cmp.true +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v59 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v58 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v56 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v46 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; SI-NEXT: v_or_b32_e32 v3, v63, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v60 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v43 +; SI-NEXT: v_or_b32_e32 v3, v61, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; SI-NEXT: v_or_b32_e32 v4, v62, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v54 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; SI-NEXT: v_or_b32_e32 v6, v57, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v38 +; SI-NEXT: v_or_b32_e32 v6, v44, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: 
v_add_i32_e32 v6, vcc, s6, v6 +; SI-NEXT: v_or_b32_e32 v7, v45, v7 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v20 +; SI-NEXT: v_or_b32_e32 v7, v10, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; SI-NEXT: v_or_b32_e32 v10, v41, v10 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v10, v7 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v31 +; SI-NEXT: v_or_b32_e32 v10, v39, v10 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; SI-NEXT: v_or_b32_e32 v12, v51, v12 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; SI-NEXT: v_or_b32_e32 v12, v28, v12 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v9, v12, v9 +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v53 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v8, v8, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v9 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; SI-NEXT: 
buffer_load_dword v15, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v14, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; SI-NEXT: v_or_b32_e32 v8, v14, v8 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v14, 
0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v27, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v8, v15, v8 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v23, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v8, v17, v8 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v8, v16, v8 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: 
v_or_b32_e32 v16, v16, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; SI-NEXT: v_or_b32_e32 v8, v18, v8 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v18, v18, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v8, v19, v8 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; SI-NEXT: v_or_b32_e32 v8, v20, v8 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: 
v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v8, v13, v8 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_or_b32_e32 v21, v13, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v11 +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v8, v11, v8 +; SI-NEXT: v_or_b32_e32 v22, v8, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; 
SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v11, v8 +; SI-NEXT: v_or_b32_e32 v23, v8, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v11, v8 +; SI-NEXT: v_or_b32_e32 v24, v8, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:920 ; 4-byte 
Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v11, v8 +; SI-NEXT: v_or_b32_e32 v25, v8, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-NEXT: v_or_b32_e32 v5, v8, v5 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v11, v8 +; SI-NEXT: v_or_b32_e32 v26, v8, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v8, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, 
v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v28, v8, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v1 +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, s7, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, s7, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v23, v28, v23 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v23 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], 
s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v24, vcc, s7, v24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload @@ -144710,7 +145673,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v27 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v27 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 @@ -144801,7 +145764,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v32, vcc, s7, v32 ; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 @@ -144821,7 +145784,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 ; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 @@ -144831,387 +145794,435 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x 
i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v19 ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill ; SI-NEXT: 
s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:836 ; 
4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v7 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v2 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; SI-NEXT: 
v_lshlrev_b32_e32 v26, 16, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v3 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill ; SI-NEXT: .LBB88_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 
0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v55 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v52 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v49 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: 
v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v48 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 
v4, 1.0, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v37 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v35 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:800 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v34 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 
v3, 1.0, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v30 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: 
v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v3, 
v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v33 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload -; 
SI-NEXT: v_mul_f32_e32 v2, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v32 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 
-; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v29 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload 
+; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v12 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v26 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v32 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v21 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v23 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -149433,13 
+150444,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:308 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304 ; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane -; SI-NEXT: s_mov_b32 s72, s21 +; SI-NEXT: s_mov_b32 s73, s28 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_writelane_b32 v43, s19, 0 ; SI-NEXT: v_writelane_b32 v43, s18, 1 ; SI-NEXT: v_writelane_b32 v43, s17, 2 ; SI-NEXT: v_writelane_b32 v43, s16, 3 -; SI-NEXT: s_mov_b32 s60, s24 +; SI-NEXT: s_mov_b32 s79, s25 ; SI-NEXT: v_writelane_b32 v41, s30, 0 ; SI-NEXT: v_writelane_b32 v41, s31, 1 ; SI-NEXT: v_writelane_b32 v41, s34, 2 @@ -149464,8 +150475,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_writelane_b32 v41, s69, 21 ; SI-NEXT: v_writelane_b32 v41, s70, 22 ; SI-NEXT: v_writelane_b32 v41, s71, 23 -; SI-NEXT: s_mov_b32 s77, s28 -; SI-NEXT: s_mov_b32 s76, s27 +; SI-NEXT: s_mov_b32 s58, s29 +; SI-NEXT: s_mov_b32 s61, s27 +; SI-NEXT: s_mov_b32 s77, s26 +; SI-NEXT: s_mov_b32 s88, s23 ; SI-NEXT: v_writelane_b32 v41, s80, 24 ; SI-NEXT: v_writelane_b32 v41, s81, 25 ; SI-NEXT: v_writelane_b32 v41, s82, 26 @@ -149478,7 +150491,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_writelane_b32 v41, s97, 33 ; SI-NEXT: v_writelane_b32 v41, s98, 34 ; SI-NEXT: v_writelane_b32 v41, s99, 35 -; SI-NEXT: s_mov_b32 s79, s26 +; SI-NEXT: s_mov_b32 s63, s21 ; SI-NEXT: v_readfirstlane_b32 s38, v20 ; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane ; SI-NEXT: v_readfirstlane_b32 s39, v19 @@ -149504,9 +150517,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_readfirstlane_b32 s17, v2 ; SI-NEXT: v_readfirstlane_b32 s18, v5 ; SI-NEXT: v_readfirstlane_b32 s19, v6 -; SI-NEXT: v_readfirstlane_b32 s88, v4 -; SI-NEXT: v_readfirstlane_b32 s89, v3 -; SI-NEXT: 
v_readfirstlane_b32 s90, v9 +; SI-NEXT: v_readfirstlane_b32 s78, v4 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s6, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 @@ -149542,6 +150553,8 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v38 ; SI-NEXT: v_writelane_b32 v43, s4, 10 +; SI-NEXT: v_readfirstlane_b32 s89, v3 +; SI-NEXT: v_readfirstlane_b32 s90, v9 ; SI-NEXT: v_readfirstlane_b32 s91, v10 ; SI-NEXT: v_readfirstlane_b32 s92, v8 ; SI-NEXT: v_readfirstlane_b32 s93, v7 @@ -149578,39 +150591,38 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:228 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s75, v32 +; SI-NEXT: v_readfirstlane_b32 s56, v32 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s61, v33 +; SI-NEXT: v_readfirstlane_b32 s75, v33 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220 -; SI-NEXT: v_writelane_b32 v43, s4, 16 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s43, v34 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s40, v35 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s4, v36 +; SI-NEXT: v_readfirstlane_b32 s42, v36 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s63, v37 +; SI-NEXT: v_readfirstlane_b32 s62, v37 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:212 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:204 -; SI-NEXT: v_writelane_b32 v43, s4, 17 +; SI-NEXT: v_writelane_b32 v43, s4, 16 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s59, 
v31 +; SI-NEXT: v_readfirstlane_b32 s74, v31 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s42, v38 +; SI-NEXT: v_readfirstlane_b32 s60, v38 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s73, v39 +; SI-NEXT: v_readfirstlane_b32 s28, v39 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s21, v48 +; SI-NEXT: v_readfirstlane_b32 s57, v48 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s57, v49 +; SI-NEXT: v_readfirstlane_b32 s59, v49 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s13, v50 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s45, v51 +; SI-NEXT: v_readfirstlane_b32 s76, v51 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:196 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 @@ -149619,25 +150631,28 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s47, v32 +; SI-NEXT: v_readfirstlane_b32 s25, v32 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s24, v33 +; SI-NEXT: v_readfirstlane_b32 s46, v33 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s78, v34 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: v_writelane_b32 v43, s4, 17 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v35 ; SI-NEXT: 
v_writelane_b32 v43, s4, 18 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v36 ; SI-NEXT: v_writelane_b32 v43, s4, 19 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s4, v37 ; SI-NEXT: v_writelane_b32 v43, s4, 20 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: v_writelane_b32 v43, s4, 21 @@ -149688,20 +150703,20 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_readfirstlane_b32 s4, v40 ; SI-NEXT: v_writelane_b32 v43, s4, 33 ; SI-NEXT: v_writelane_b32 v43, s22, 34 -; SI-NEXT: v_writelane_b32 v43, s23, 35 -; SI-NEXT: v_writelane_b32 v43, s72, 36 +; SI-NEXT: v_writelane_b32 v43, s88, 35 +; SI-NEXT: v_writelane_b32 v43, s63, 36 ; SI-NEXT: v_writelane_b32 v43, s20, 37 -; SI-NEXT: v_writelane_b32 v43, s79, 38 -; SI-NEXT: v_writelane_b32 v43, s76, 39 -; SI-NEXT: v_writelane_b32 v43, s25, 40 -; SI-NEXT: v_writelane_b32 v43, s60, 41 -; SI-NEXT: v_writelane_b32 v43, s29, 42 -; SI-NEXT: v_writelane_b32 v43, s77, 43 +; SI-NEXT: v_writelane_b32 v43, s77, 38 +; SI-NEXT: v_writelane_b32 v43, s61, 39 +; SI-NEXT: v_writelane_b32 v43, s79, 40 +; SI-NEXT: v_writelane_b32 v43, s24, 41 +; SI-NEXT: v_writelane_b32 v43, s58, 42 +; SI-NEXT: v_writelane_b32 v43, s73, 43 ; SI-NEXT: v_writelane_b32 v43, s16, 44 ; SI-NEXT: v_writelane_b32 v43, s17, 45 ; SI-NEXT: v_writelane_b32 v43, s18, 46 ; SI-NEXT: v_writelane_b32 v43, s19, 47 -; SI-NEXT: v_writelane_b32 v43, s88, 48 +; SI-NEXT: v_writelane_b32 v43, s78, 48 ; SI-NEXT: v_writelane_b32 v43, s89, 49 ; SI-NEXT: v_writelane_b32 v43, s90, 50 ; SI-NEXT: v_writelane_b32 v43, s91, 51 @@ -149710,15 +150725,15 @@ define inreg 
<64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_writelane_b32 v43, s94, 54 ; SI-NEXT: v_writelane_b32 v43, s95, 55 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s62, v33 +; SI-NEXT: v_readfirstlane_b32 s26, v33 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s10, v34 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s66, v35 -; SI-NEXT: v_readfirstlane_b32 s28, v31 +; SI-NEXT: v_readfirstlane_b32 s23, v31 ; SI-NEXT: v_readfirstlane_b32 s27, v32 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s58, v36 +; SI-NEXT: v_readfirstlane_b32 s29, v36 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s69, v37 ; SI-NEXT: s_waitcnt vmcnt(5) @@ -149758,9 +150773,9 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_writelane_b32 v43, s36, 62 ; SI-NEXT: v_writelane_b32 v43, s37, 63 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s74, v31 +; SI-NEXT: v_readfirstlane_b32 s21, v31 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s46, v32 +; SI-NEXT: v_readfirstlane_b32 s45, v32 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s96, v33 ; SI-NEXT: s_waitcnt vmcnt(9) @@ -149768,7 +150783,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s41, v35 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s56, v36 +; SI-NEXT: v_readfirstlane_b32 s72, v36 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s87, v37 ; SI-NEXT: s_waitcnt vmcnt(5) @@ -149784,7 +150799,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s26, v48 +; SI-NEXT: v_readfirstlane_b32 s47, v48 ; SI-NEXT: 
s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s83, v49 ; SI-NEXT: s_waitcnt vmcnt(9) @@ -149843,25 +150858,25 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_writelane_b32 v42, s41, 28 ; SI-NEXT: v_writelane_b32 v42, s80, 29 ; SI-NEXT: v_writelane_b32 v42, s7, 30 -; SI-NEXT: v_writelane_b32 v42, s56, 31 -; SI-NEXT: v_writelane_b32 v42, s26, 32 +; SI-NEXT: v_writelane_b32 v42, s72, 31 +; SI-NEXT: v_writelane_b32 v42, s47, 32 ; SI-NEXT: v_writelane_b32 v42, s15, 33 ; SI-NEXT: v_writelane_b32 v42, s14, 34 ; SI-NEXT: v_writelane_b32 v42, s69, 35 ; SI-NEXT: v_writelane_b32 v42, s71, 36 ; SI-NEXT: v_writelane_b32 v42, s70, 37 ; SI-NEXT: v_writelane_b32 v42, s68, 38 -; SI-NEXT: v_writelane_b32 v42, s74, 39 -; SI-NEXT: v_writelane_b32 v42, s46, 40 +; SI-NEXT: v_writelane_b32 v42, s21, 39 +; SI-NEXT: v_writelane_b32 v42, s45, 40 ; SI-NEXT: v_writelane_b32 v42, s11, 41 ; SI-NEXT: v_writelane_b32 v42, s10, 42 -; SI-NEXT: v_writelane_b32 v42, s62, 43 +; SI-NEXT: v_writelane_b32 v42, s26, 43 ; SI-NEXT: v_writelane_b32 v42, s66, 44 -; SI-NEXT: v_writelane_b32 v42, s58, 45 -; SI-NEXT: v_writelane_b32 v42, s28, 46 +; SI-NEXT: v_writelane_b32 v42, s29, 45 +; SI-NEXT: v_writelane_b32 v42, s23, 46 ; SI-NEXT: v_writelane_b32 v42, s27, 47 -; SI-NEXT: v_writelane_b32 v42, s78, 48 -; SI-NEXT: v_writelane_b32 v42, s24, 49 +; SI-NEXT: v_writelane_b32 v42, s46, 48 +; SI-NEXT: v_writelane_b32 v42, s13, 49 ; SI-NEXT: s_cbranch_scc0 .LBB89_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_readlane_b32 s4, v43, 3 @@ -149870,66 +150885,65 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_writelane_b32 v42, s4, 56 +; SI-NEXT: v_writelane_b32 v42, s4, 50 ; SI-NEXT: v_readlane_b32 s4, v43, 1 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: v_readlane_b32 s5, v43, 0 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: 
s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_writelane_b32 v42, s4, 57 +; SI-NEXT: v_writelane_b32 v42, s4, 51 ; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: s_lshl_b32 s5, s63, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s22, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_mov_b32 s22, s6 -; SI-NEXT: s_lshl_b32 s6, s23, 24 -; SI-NEXT: v_writelane_b32 v42, s4, 58 +; SI-NEXT: s_lshl_b32 s6, s88, 24 +; SI-NEXT: v_writelane_b32 v42, s4, 52 ; SI-NEXT: s_or_b32 s4, s6, s5 -; SI-NEXT: s_and_b32 s5, s60, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s25, 24 -; SI-NEXT: v_writelane_b32 v42, s4, 59 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_writelane_b32 v42, s5, 60 -; SI-NEXT: s_and_b32 s5, s79, 0xff +; SI-NEXT: s_and_b32 s5, s24, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s76, 24 -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_writelane_b32 v42, s5, 61 +; SI-NEXT: s_lshl_b32 s6, s79, 24 +; SI-NEXT: v_writelane_b32 v42, s4, 53 +; SI-NEXT: s_or_b32 s4, s6, s5 ; SI-NEXT: s_and_b32 s5, s77, 0xff -; SI-NEXT: s_lshl_b32 s6, s29, 8 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s61, 24 +; SI-NEXT: v_writelane_b32 v42, s4, 54 +; SI-NEXT: s_or_b32 s4, s6, s5 +; SI-NEXT: s_and_b32 s5, s73, 0xff +; SI-NEXT: s_lshl_b32 s6, s58, 8 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s16, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s16, s17, 24 -; SI-NEXT: s_or_b32 s6, s16, s6 -; SI-NEXT: v_writelane_b32 v42, s6, 62 +; SI-NEXT: v_writelane_b32 v42, s4, 55 +; SI-NEXT: s_or_b32 s4, s16, s6 ; SI-NEXT: s_and_b32 s6, s89, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s16, s88, 24 -; SI-NEXT: s_mov_b32 s4, s47 -; SI-NEXT: s_or_b32 s47, s16, s6 +; SI-NEXT: s_lshl_b32 s16, s78, 24 +; SI-NEXT: v_writelane_b32 v42, s4, 56 +; SI-NEXT: s_or_b32 s4, s16, s6 ; SI-NEXT: s_and_b32 s6, s18, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: 
s_lshl_b32 s16, s19, 24 -; SI-NEXT: s_or_b32 s25, s16, s6 +; SI-NEXT: s_or_b32 s24, s16, s6 ; SI-NEXT: s_and_b32 s6, s93, 0xff ; SI-NEXT: s_lshl_b32 s16, s92, 8 ; SI-NEXT: s_or_b32 s6, s6, s16 ; SI-NEXT: s_and_b32 s16, s90, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s17, s91, 24 -; SI-NEXT: s_or_b32 s92, s17, s16 +; SI-NEXT: s_or_b32 s78, s17, s16 ; SI-NEXT: s_and_b32 s16, vcc_hi, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s17, vcc_lo, 24 -; SI-NEXT: s_or_b32 s76, s17, s16 +; SI-NEXT: s_or_b32 s79, s17, s16 ; SI-NEXT: s_and_b32 s16, s94, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s17, s95, 24 -; SI-NEXT: s_or_b32 s91, s17, s16 +; SI-NEXT: s_or_b32 s73, s17, s16 ; SI-NEXT: s_and_b32 s16, s35, 0xff ; SI-NEXT: s_lshl_b32 s17, s34, 8 ; SI-NEXT: s_or_b32 s16, s16, s17 @@ -149940,33 +150954,35 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s17, s39, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s38, 24 -; SI-NEXT: s_or_b32 s79, s18, s17 +; SI-NEXT: s_or_b32 s88, s18, s17 ; SI-NEXT: s_and_b32 s17, s36, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s37, 24 -; SI-NEXT: s_or_b32 s93, s18, s17 +; SI-NEXT: s_or_b32 s89, s18, s17 ; SI-NEXT: s_and_b32 s17, s51, 0xff ; SI-NEXT: s_lshl_b32 s18, s50, 8 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_and_b32 s18, s48, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_lshl_b32 s19, s49, 24 -; SI-NEXT: s_or_b32 s89, s19, s18 +; SI-NEXT: v_writelane_b32 v42, s4, 57 +; SI-NEXT: s_mov_b32 s4, s59 +; SI-NEXT: s_or_b32 s59, s19, s18 ; SI-NEXT: s_and_b32 s18, s55, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_lshl_b32 s19, s54, 24 -; SI-NEXT: s_or_b32 s31, s19, s18 +; SI-NEXT: s_or_b32 s61, s19, s18 ; SI-NEXT: s_and_b32 s18, s52, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_lshl_b32 s19, s53, 24 -; SI-NEXT: s_or_b32 s94, s19, s18 +; SI-NEXT: s_or_b32 s48, s19, s18 ; 
SI-NEXT: s_and_b32 s18, s84, 0xff ; SI-NEXT: s_lshl_b32 s19, s67, 8 ; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: s_and_b32 s19, s64, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s65, 24 -; SI-NEXT: s_or_b32 s60, s20, s19 +; SI-NEXT: s_or_b32 s93, s20, s19 ; SI-NEXT: s_and_b32 s19, s12, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s8, 24 @@ -149985,139 +151001,136 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s19, s15, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: s_or_b32 s7, s20, s19 +; SI-NEXT: s_or_b32 s52, s20, s19 ; SI-NEXT: s_and_b32 s19, s82, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s83, 24 -; SI-NEXT: s_or_b32 s23, s20, s19 -; SI-NEXT: s_and_b32 s19, s26, 0xff +; SI-NEXT: s_or_b32 s53, s20, s19 +; SI-NEXT: s_and_b32 s19, s47, 0xff ; SI-NEXT: s_lshl_b32 s20, s81, 8 ; SI-NEXT: s_or_b32 vcc_hi, s19, s20 ; SI-NEXT: s_and_b32 s19, s99, 0xff -; SI-NEXT: v_writelane_b32 v42, s9, 50 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s87, 24 -; SI-NEXT: v_writelane_b32 v42, s7, 51 -; SI-NEXT: s_or_b32 s7, s20, s19 -; SI-NEXT: s_and_b32 s19, s56, 0xff +; SI-NEXT: s_or_b32 s54, s20, s19 +; SI-NEXT: s_and_b32 s19, s72, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s41, 24 -; SI-NEXT: v_writelane_b32 v42, s7, 52 -; SI-NEXT: s_or_b32 s7, s20, s19 +; SI-NEXT: s_or_b32 s49, s20, s19 ; SI-NEXT: s_and_b32 s19, s98, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s96, 24 -; SI-NEXT: v_writelane_b32 v42, s7, 54 -; SI-NEXT: s_or_b32 s7, s20, s19 -; SI-NEXT: s_and_b32 s19, s46, 0xff -; SI-NEXT: s_lshl_b32 s20, s74, 8 +; SI-NEXT: s_or_b32 s90, s20, s19 +; SI-NEXT: s_and_b32 s19, s45, 0xff +; SI-NEXT: s_lshl_b32 s20, s21, 8 ; SI-NEXT: s_or_b32 s84, s19, s20 ; SI-NEXT: s_and_b32 s19, s71, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s70, 24 -; 
SI-NEXT: s_or_b32 s72, s20, s19 +; SI-NEXT: s_or_b32 s92, s20, s19 ; SI-NEXT: s_and_b32 s19, s11, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s68, 24 -; SI-NEXT: v_writelane_b32 v42, s7, 53 ; SI-NEXT: s_or_b32 s7, s20, s19 ; SI-NEXT: s_and_b32 s19, s14, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s69, 24 -; SI-NEXT: s_or_b32 s9, s20, s19 -; SI-NEXT: s_and_b32 s19, s58, 0xff +; SI-NEXT: s_or_b32 s58, s20, s19 +; SI-NEXT: s_and_b32 s19, s29, 0xff ; SI-NEXT: s_lshl_b32 s20, s66, 8 ; SI-NEXT: s_or_b32 s85, s19, s20 ; SI-NEXT: s_and_b32 s19, s10, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s62, 24 -; SI-NEXT: s_or_b32 s49, s20, s19 +; SI-NEXT: s_lshl_b32 s20, s26, 24 +; SI-NEXT: s_or_b32 s63, s20, s19 ; SI-NEXT: s_and_b32 s19, s27, 0xff -; SI-NEXT: v_writelane_b32 v42, s9, 55 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s28, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 33 -; SI-NEXT: s_or_b32 s50, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 32 +; SI-NEXT: s_lshl_b32 s20, s23, 24 +; SI-NEXT: v_readlane_b32 s11, v43, 33 +; SI-NEXT: s_or_b32 s10, s20, s19 +; SI-NEXT: s_and_b32 s19, s11, 0xff +; SI-NEXT: v_readlane_b32 s11, v43, 32 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 31 +; SI-NEXT: s_lshl_b32 s20, s11, 24 +; SI-NEXT: v_readlane_b32 s11, v43, 31 ; SI-NEXT: s_or_b32 s51, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 30 -; SI-NEXT: s_lshl_b32 s20, s9, 8 -; SI-NEXT: v_readlane_b32 s9, v43, 29 +; SI-NEXT: s_and_b32 s19, s11, 0xff +; SI-NEXT: v_readlane_b32 s11, v43, 30 +; SI-NEXT: s_lshl_b32 s20, s11, 8 +; SI-NEXT: v_readlane_b32 s11, v43, 29 ; SI-NEXT: s_or_b32 s86, s19, s20 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 28 +; SI-NEXT: s_and_b32 s19, s11, 0xff +; SI-NEXT: v_readlane_b32 s11, v43, 28 ; SI-NEXT: s_lshl_b32 s19, s19, 16 
-; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 27 -; SI-NEXT: s_or_b32 s52, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 26 +; SI-NEXT: s_lshl_b32 s20, s11, 24 +; SI-NEXT: v_readlane_b32 s11, v43, 27 +; SI-NEXT: s_or_b32 s55, s20, s19 +; SI-NEXT: s_and_b32 s19, s11, 0xff +; SI-NEXT: v_readlane_b32 s11, v43, 26 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 25 -; SI-NEXT: s_or_b32 s53, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 24 +; SI-NEXT: s_lshl_b32 s20, s11, 24 +; SI-NEXT: v_readlane_b32 s11, v43, 25 +; SI-NEXT: s_or_b32 s14, s20, s19 +; SI-NEXT: s_and_b32 s19, s11, 0xff +; SI-NEXT: v_readlane_b32 s11, v43, 24 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 23 -; SI-NEXT: s_or_b32 s54, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 22 -; SI-NEXT: s_lshl_b32 s20, s9, 8 -; SI-NEXT: v_readlane_b32 s9, v43, 21 +; SI-NEXT: s_lshl_b32 s20, s11, 24 +; SI-NEXT: v_readlane_b32 s11, v43, 23 +; SI-NEXT: s_or_b32 s64, s20, s19 +; SI-NEXT: s_and_b32 s19, s11, 0xff +; SI-NEXT: v_readlane_b32 s11, v43, 22 +; SI-NEXT: s_lshl_b32 s20, s11, 8 +; SI-NEXT: v_readlane_b32 s11, v43, 21 ; SI-NEXT: s_or_b32 s87, s19, s20 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 20 +; SI-NEXT: s_and_b32 s19, s11, 0xff +; SI-NEXT: v_readlane_b32 s11, v43, 20 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v43, 19 -; SI-NEXT: s_or_b32 s55, s20, s19 -; SI-NEXT: s_mov_b32 s58, s9 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 18 +; SI-NEXT: s_lshl_b32 s20, s11, 24 +; SI-NEXT: v_readlane_b32 s11, v43, 19 +; SI-NEXT: s_or_b32 s65, s20, s19 +; SI-NEXT: s_and_b32 s19, s11, 0xff +; SI-NEXT: v_readlane_b32 s11, v43, 18 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; 
SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: s_or_b32 s64, s20, s19 -; SI-NEXT: s_and_b32 s19, s78, 0xff +; SI-NEXT: s_lshl_b32 s20, s11, 24 +; SI-NEXT: v_readlane_b32 s11, v43, 17 +; SI-NEXT: s_or_b32 s15, s20, s19 +; SI-NEXT: s_and_b32 s19, s11, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s24, 24 -; SI-NEXT: s_or_b32 s65, s20, s19 -; SI-NEXT: s_and_b32 s19, s4, 0xff -; SI-NEXT: s_lshl_b32 s20, s45, 8 +; SI-NEXT: s_lshl_b32 s20, s46, 24 +; SI-NEXT: s_or_b32 s66, s20, s19 +; SI-NEXT: s_and_b32 s19, s25, 0xff +; SI-NEXT: s_lshl_b32 s20, s76, 8 ; SI-NEXT: s_or_b32 s26, s19, s20 ; SI-NEXT: s_and_b32 s19, s13, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s57, 24 -; SI-NEXT: s_or_b32 s66, s20, s19 -; SI-NEXT: s_and_b32 s19, s21, 0xff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s73, 24 +; SI-NEXT: s_lshl_b32 s20, s4, 24 ; SI-NEXT: s_or_b32 s67, s20, s19 -; SI-NEXT: s_and_b32 s19, s42, 0xff -; SI-NEXT: v_readlane_b32 s88, v43, 17 +; SI-NEXT: s_and_b32 s19, s57, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s59, 24 +; SI-NEXT: s_lshl_b32 s20, s28, 24 ; SI-NEXT: s_or_b32 s68, s20, s19 -; SI-NEXT: s_and_b32 s19, s63, 0xff -; SI-NEXT: s_lshl_b32 s20, s88, 8 +; SI-NEXT: s_and_b32 s19, s60, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: s_lshl_b32 s20, s74, 24 +; SI-NEXT: s_or_b32 s69, s20, s19 +; SI-NEXT: s_and_b32 s19, s62, 0xff +; SI-NEXT: s_lshl_b32 s20, s42, 8 ; SI-NEXT: s_or_b32 s27, s19, s20 ; SI-NEXT: s_and_b32 s19, s40, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s43, 24 -; SI-NEXT: s_or_b32 s69, s20, s19 -; SI-NEXT: s_and_b32 s19, s61, 0xff -; SI-NEXT: s_mov_b32 s39, s57 +; SI-NEXT: s_or_b32 s70, s20, s19 +; SI-NEXT: s_and_b32 s19, s75, 0xff +; SI-NEXT: s_mov_b32 s34, s57 ; SI-NEXT: s_mov_b32 s57, s7 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s75, 24 +; SI-NEXT: s_lshl_b32 s20, s56, 24 ; SI-NEXT: v_readlane_b32 s7, v43, 16 -; 
SI-NEXT: s_or_b32 s70, s20, s19 +; SI-NEXT: s_mov_b32 s95, s42 +; SI-NEXT: s_mov_b32 s42, s40 +; SI-NEXT: s_mov_b32 s35, s56 +; SI-NEXT: s_mov_b32 s56, s10 +; SI-NEXT: s_or_b32 s40, s20, s19 ; SI-NEXT: s_mov_b32 s10, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff ; SI-NEXT: v_readlane_b32 s7, v43, 15 @@ -150125,23 +151138,29 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_mov_b32 s71, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 24 ; SI-NEXT: v_readlane_b32 s7, v43, 14 -; SI-NEXT: s_or_b32 s62, s20, s19 +; SI-NEXT: s_mov_b32 s13, s93 +; SI-NEXT: s_mov_b32 s36, s43 +; SI-NEXT: s_mov_b32 s43, s15 +; SI-NEXT: s_or_b32 s93, s20, s19 ; SI-NEXT: s_mov_b32 s15, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff ; SI-NEXT: v_readlane_b32 s7, v43, 13 ; SI-NEXT: s_mov_b32 s41, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 8 ; SI-NEXT: v_readlane_b32 s7, v43, 12 +; SI-NEXT: s_mov_b32 s46, s14 ; SI-NEXT: s_or_b32 s29, s19, s20 ; SI-NEXT: s_mov_b32 s14, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff ; SI-NEXT: v_readlane_b32 s7, v43, 11 +; SI-NEXT: s_mov_b32 s47, s76 +; SI-NEXT: s_mov_b32 s76, s9 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_mov_b32 s9, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 24 ; SI-NEXT: v_readlane_b32 s7, v43, 10 ; SI-NEXT: s_or_b32 s80, s20, s19 -; SI-NEXT: s_mov_b32 s56, s7 +; SI-NEXT: s_mov_b32 s72, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff ; SI-NEXT: v_readlane_b32 s7, v43, 9 ; SI-NEXT: s_lshl_b32 s19, s19, 16 @@ -150156,43 +151175,32 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_mov_b32 s96, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 24 ; SI-NEXT: v_readlane_b32 s7, v43, 6 -; SI-NEXT: s_mov_b32 s36, s63 -; SI-NEXT: s_mov_b32 s63, s93 -; SI-NEXT: s_mov_b32 s93, s61 -; SI-NEXT: s_mov_b32 s61, s91 -; SI-NEXT: s_mov_b32 s91, s75 -; SI-NEXT: s_mov_b32 s75, s92 -; SI-NEXT: s_or_b32 s92, s20, s19 +; SI-NEXT: s_or_b32 s45, s20, s19 ; SI-NEXT: s_mov_b32 s98, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff ; SI-NEXT: v_readlane_b32 s7, v43, 5 ; 
SI-NEXT: s_mov_b32 s44, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 8 ; SI-NEXT: v_readlane_b32 s7, v43, 4 -; SI-NEXT: s_mov_b32 s48, s13 -; SI-NEXT: s_mov_b32 s13, s94 -; SI-NEXT: s_mov_b32 s94, s21 ; SI-NEXT: s_or_b32 s21, s19, s20 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: s_mov_b32 s95, s4 +; SI-NEXT: s_mov_b32 s39, s4 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s22, 24 -; SI-NEXT: v_readlane_b32 s4, v42, 58 -; SI-NEXT: s_mov_b32 s46, s45 -; SI-NEXT: s_mov_b32 s34, s73 -; SI-NEXT: s_mov_b32 s73, s12 -; SI-NEXT: s_mov_b32 s37, s42 -; SI-NEXT: s_mov_b32 s38, s59 -; SI-NEXT: s_mov_b32 s59, s8 -; SI-NEXT: s_mov_b32 s30, s88 -; SI-NEXT: s_mov_b32 s88, s31 -; SI-NEXT: s_mov_b32 s78, s40 -; SI-NEXT: s_mov_b32 s31, s43 +; SI-NEXT: v_readlane_b32 s4, v42, 52 +; SI-NEXT: s_mov_b32 s91, s25 +; SI-NEXT: s_mov_b32 s31, s28 +; SI-NEXT: s_mov_b32 s50, s60 +; SI-NEXT: s_mov_b32 s60, s8 +; SI-NEXT: s_mov_b32 s94, s74 +; SI-NEXT: s_mov_b32 s74, s12 +; SI-NEXT: s_mov_b32 s38, s62 +; SI-NEXT: s_mov_b32 s37, s75 ; SI-NEXT: s_mov_b32 s12, s7 ; SI-NEXT: s_mov_b32 s7, s22 ; SI-NEXT: s_or_b32 s83, s20, s19 ; SI-NEXT: s_lshl_b32 s20, s4, 16 -; SI-NEXT: s_lshl_b32 s74, s5, 16 +; SI-NEXT: s_lshl_b32 s62, s5, 16 ; SI-NEXT: s_lshl_b32 s22, s6, 16 ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s19, s17, 16 @@ -150204,16 +151212,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_lshl_b32 s97, s86, 16 ; SI-NEXT: s_lshl_b32 s28, s87, 16 ; SI-NEXT: s_lshl_b32 s87, s26, 16 -; SI-NEXT: v_readlane_b32 s26, v42, 56 +; SI-NEXT: v_readlane_b32 s26, v42, 50 ; SI-NEXT: s_lshl_b32 s86, s27, 16 -; SI-NEXT: v_readlane_b32 s27, v42, 57 -; SI-NEXT: v_readlane_b32 s35, v42, 61 +; SI-NEXT: v_readlane_b32 s27, v42, 51 +; SI-NEXT: v_readlane_b32 s30, v42, 56 ; SI-NEXT: s_lshl_b32 s85, s29, 16 -; SI-NEXT: v_readlane_b32 s29, v42, 60 -; SI-NEXT: v_readlane_b32 s24, v42, 59 -; SI-NEXT: v_readlane_b32 s90, v42, 62 +; SI-NEXT: v_readlane_b32 s29, 
v42, 53 +; SI-NEXT: v_readlane_b32 s25, v42, 55 +; SI-NEXT: v_readlane_b32 s23, v42, 54 ; SI-NEXT: s_lshl_b32 s84, s21, 16 -; SI-NEXT: s_mov_b32 s21, s47 +; SI-NEXT: v_readlane_b32 s21, v42, 57 ; SI-NEXT: s_cbranch_execnz .LBB89_3 ; SI-NEXT: .LBB89_2: ; %cmp.true ; SI-NEXT: s_add_i32 s4, s98, 3 @@ -150228,7 +151236,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s5, s56, 3 +; SI-NEXT: s_add_i32 s5, s72, 3 ; SI-NEXT: s_and_b32 s5, s5, 0xff ; SI-NEXT: s_lshl_b32 s6, s81, 8 ; SI-NEXT: s_add_i32 s16, s82, 3 @@ -150252,9 +151260,9 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s6, s16, s6 -; SI-NEXT: s_add_i32 s16, s93, 3 +; SI-NEXT: s_add_i32 s16, s37, 3 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s17, s91, 8 +; SI-NEXT: s_lshl_b32 s17, s35, 8 ; SI-NEXT: s_add_i32 s18, s10, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_and_b32 s18, s18, 0xff @@ -150264,34 +151272,35 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s17, s36, 3 +; SI-NEXT: s_add_i32 s17, s38, 3 ; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: s_lshl_b32 s18, s30, 8 -; SI-NEXT: s_add_i32 s19, s78, 3 +; SI-NEXT: s_lshl_b32 s18, s95, 8 +; SI-NEXT: s_add_i32 s19, s42, 3 ; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: s_and_b32 s19, s19, 0xff -; SI-NEXT: s_lshl_b32 s18, s31, 24 +; SI-NEXT: s_lshl_b32 s18, s36, 24 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_addk_i32 s17, 0x300 ; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: s_and_b32 s17, s17, 0xffff ; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_add_i32 s18, s94, 3 +; SI-NEXT: s_add_i32 s18, s34, 3 ; 
SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_lshl_b32 s19, s34, 8 -; SI-NEXT: s_add_i32 s20, s37, 3 +; SI-NEXT: s_lshl_b32 s19, s31, 8 +; SI-NEXT: s_add_i32 s20, s50, 3 ; SI-NEXT: s_or_b32 s18, s19, s18 ; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s19, s38, 24 +; SI-NEXT: s_lshl_b32 s19, s94, 24 ; SI-NEXT: s_lshl_b32 s20, s20, 16 ; SI-NEXT: s_addk_i32 s18, 0x300 ; SI-NEXT: s_or_b32 s19, s19, s20 ; SI-NEXT: s_and_b32 s18, s18, 0xffff ; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_add_i32 s19, s95, 3 +; SI-NEXT: s_add_i32 s19, s91, 3 +; SI-NEXT: v_readlane_b32 s7, v42, 49 ; SI-NEXT: s_and_b32 s19, s19, 0xff -; SI-NEXT: s_lshl_b32 s20, s46, 8 -; SI-NEXT: s_add_i32 s22, s48, 3 +; SI-NEXT: s_lshl_b32 s20, s47, 8 +; SI-NEXT: s_add_i32 s22, s7, 3 ; SI-NEXT: s_or_b32 s19, s20, s19 ; SI-NEXT: s_and_b32 s22, s22, 0xff ; SI-NEXT: s_lshl_b32 s20, s39, 24 @@ -150299,15 +151308,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_addk_i32 s19, 0x300 ; SI-NEXT: s_or_b32 s20, s20, s22 ; SI-NEXT: s_and_b32 s19, s19, 0xffff +; SI-NEXT: v_readlane_b32 s7, v43, 19 ; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: s_add_i32 s20, s58, 3 +; SI-NEXT: s_add_i32 s20, s7, 3 ; SI-NEXT: v_readlane_b32 s7, v43, 18 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s22, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v42, 49 +; SI-NEXT: v_readlane_b32 s7, v42, 48 ; SI-NEXT: s_or_b32 s20, s22, s20 ; SI-NEXT: s_lshl_b32 s22, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v42, 48 +; SI-NEXT: v_readlane_b32 s7, v43, 17 ; SI-NEXT: s_add_i32 s23, s7, 3 ; SI-NEXT: s_and_b32 s23, s23, 0xff ; SI-NEXT: s_lshl_b32 s23, s23, 16 @@ -150688,53 +151698,41 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_lshl_b32 s25, s25, 16 ; SI-NEXT: s_and_b32 s20, s20, 0xffff ; SI-NEXT: s_or_b32 s24, s24, s25 -; SI-NEXT: s_and_b32 s46, s46, 0xff ; SI-NEXT: s_or_b32 s20, s24, s20 ; SI-NEXT: v_readlane_b32 s24, v43, 3 -; SI-NEXT: 
s_lshl_b32 s46, s46, 16 -; SI-NEXT: s_addk_i32 s56, 0x300 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_readlane_b32 s25, v43, 2 ; SI-NEXT: v_readlane_b32 s26, v43, 1 -; SI-NEXT: s_or_b32 s46, s47, s46 -; SI-NEXT: s_and_b32 s47, s56, 0xffff -; SI-NEXT: s_add_i32 s7, s7, 0x3000000 -; SI-NEXT: s_add_i32 s9, s9, 0x3000000 ; SI-NEXT: s_and_b32 s24, s24, 0xff ; SI-NEXT: s_lshl_b32 s25, s25, 8 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_or_b32 s56, s46, s47 -; SI-NEXT: s_add_i32 s47, s58, 0x3000000 -; SI-NEXT: s_add_i32 s58, s59, 0x3000000 -; SI-NEXT: s_add_i32 s10, s10, 0x3000000 +; SI-NEXT: s_and_b32 s46, s46, 0xff ; SI-NEXT: s_or_b32 s24, s25, s24 ; SI-NEXT: v_readlane_b32 s25, v43, 0 ; SI-NEXT: s_and_b32 s26, s26, 0xff -; SI-NEXT: s_and_b32 s73, s9, 0xffff0000 -; SI-NEXT: s_lshl_b32 s59, s9, 16 -; SI-NEXT: s_and_b32 s9, s7, 0xffff0000 -; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_lshl_b32 s46, s46, 16 +; SI-NEXT: s_addk_i32 s56, 0x300 ; SI-NEXT: s_addk_i32 s24, 0x300 ; SI-NEXT: s_lshl_b32 s25, s25, 24 ; SI-NEXT: s_lshl_b32 s26, s26, 16 -; SI-NEXT: s_and_b32 s63, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s79, s17, 16 -; SI-NEXT: v_writelane_b32 v42, s9, 50 -; SI-NEXT: s_lshl_b32 s17, s7, 16 -; SI-NEXT: s_lshl_b32 s7, s10, 16 -; SI-NEXT: s_add_i32 s8, s8, 0x3000000 +; SI-NEXT: s_or_b32 s46, s47, s46 +; SI-NEXT: s_and_b32 s47, s56, 0xffff ; SI-NEXT: s_and_b32 s24, s24, 0xffff ; SI-NEXT: s_or_b32 s25, s25, s26 -; SI-NEXT: v_writelane_b32 v42, s7, 51 -; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; SI-NEXT: s_or_b32 s56, s46, s47 ; SI-NEXT: s_or_b32 s24, s25, s24 -; SI-NEXT: v_writelane_b32 v42, s7, 52 -; SI-NEXT: s_and_b32 s7, s8, 0xffff0000 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s46, s60, 0x3000000 +; SI-NEXT: s_add_i32 s47, s58, 0x3000000 ; SI-NEXT: s_add_i32 s56, s56, 0x3000000 ; SI-NEXT: s_add_i32 s57, s57, 0x3000000 +; SI-NEXT: s_add_i32 s58, s59, 0x3000000 +; SI-NEXT: s_add_i32 s8, s8, 0x3000000 +; 
SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_add_i32 s10, s10, 0x3000000 +; SI-NEXT: s_add_i32 s7, s7, 0x3000000 +; SI-NEXT: s_add_i32 s9, s9, 0x3000000 ; SI-NEXT: s_add_i32 s11, s11, 0x3000000 ; SI-NEXT: s_add_i32 s12, s12, 0x3000000 ; SI-NEXT: s_add_i32 s13, s13, 0x3000000 @@ -150743,291 +151741,323 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_add_i32 s19, s19, 0x3000000 ; SI-NEXT: s_add_i32 s20, s20, 0x3000000 ; SI-NEXT: s_add_i32 s24, s24, 0x3000000 -; SI-NEXT: v_writelane_b32 v42, s7, 53 -; SI-NEXT: s_lshl_b32 s7, s8, 16 ; SI-NEXT: s_and_b32 s27, s24, 0xffff0000 ; SI-NEXT: s_lshl_b32 s26, s24, 16 -; SI-NEXT: s_and_b32 s24, s20, 0xffff0000 +; SI-NEXT: s_and_b32 s29, s20, 0xffff0000 ; SI-NEXT: s_lshl_b32 s20, s20, 16 -; SI-NEXT: s_and_b32 s35, s23, 0xffff0000 -; SI-NEXT: s_lshl_b32 s29, s23, 16 -; SI-NEXT: s_and_b32 s90, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s74, s22, 16 -; SI-NEXT: s_and_b32 s25, s21, 0xffff0000 +; SI-NEXT: s_and_b32 s25, s23, 0xffff0000 +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_and_b32 s30, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s62, s22, 16 +; SI-NEXT: s_and_b32 s24, s21, 0xffff0000 ; SI-NEXT: s_lshl_b32 s21, s21, 16 -; SI-NEXT: s_and_b32 s75, s19, 0xffff0000 +; SI-NEXT: s_and_b32 s78, s19, 0xffff0000 ; SI-NEXT: s_lshl_b32 s22, s19, 16 -; SI-NEXT: s_and_b32 s61, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s76, s18, 16 +; SI-NEXT: s_and_b32 s73, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s79, s18, 16 ; SI-NEXT: s_and_b32 s77, s16, 0xffff0000 ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_and_b32 s89, s13, 0xffff0000 +; SI-NEXT: s_and_b32 s89, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s88, s17, 16 +; SI-NEXT: s_and_b32 s59, s13, 0xffff0000 ; SI-NEXT: s_lshl_b32 s19, s13, 16 -; SI-NEXT: s_and_b32 s13, s12, 0xffff0000 -; SI-NEXT: s_lshl_b32 s88, s12, 16 -; SI-NEXT: s_and_b32 s60, s11, 0xffff0000 +; SI-NEXT: s_and_b32 s48, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s61, s12, 16 +; SI-NEXT: 
s_and_b32 s13, s11, 0xffff0000 ; SI-NEXT: s_lshl_b32 s18, s11, 16 -; SI-NEXT: s_and_b32 s23, s10, 0xffff0000 +; SI-NEXT: s_and_b32 s74, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s60, s9, 16 +; SI-NEXT: s_and_b32 s76, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s17, s7, 16 +; SI-NEXT: s_and_b32 s53, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s52, s10, 16 +; SI-NEXT: s_and_b32 s54, s6, 0xffff0000 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: v_writelane_b32 v42, s7, 54 -; SI-NEXT: s_and_b32 s72, s58, 0xffff0000 +; SI-NEXT: s_and_b32 s90, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s49, s8, 16 +; SI-NEXT: s_and_b32 s92, s58, 0xffff0000 ; SI-NEXT: s_lshl_b32 s99, s58, 16 -; SI-NEXT: s_and_b32 s7, s57, 0xffff0000 +; SI-NEXT: s_and_b32 s58, s57, 0xffff0000 ; SI-NEXT: s_lshl_b32 s57, s57, 16 -; SI-NEXT: s_and_b32 s49, s56, 0xffff0000 +; SI-NEXT: s_and_b32 s63, s56, 0xffff0000 ; SI-NEXT: s_lshl_b32 s8, s56, 16 ; SI-NEXT: s_and_b32 s51, s47, 0xffff0000 -; SI-NEXT: s_lshl_b32 s50, s47, 16 -; SI-NEXT: s_and_b32 s52, s46, 0xffff0000 +; SI-NEXT: s_lshl_b32 s56, s47, 16 +; SI-NEXT: s_and_b32 s55, s46, 0xffff0000 ; SI-NEXT: s_lshl_b32 s97, s46, 16 -; SI-NEXT: s_and_b32 s54, s45, 0xffff0000 -; SI-NEXT: s_lshl_b32 s53, s45, 16 -; SI-NEXT: s_and_b32 s55, s44, 0xffff0000 +; SI-NEXT: s_and_b32 s64, s45, 0xffff0000 +; SI-NEXT: s_lshl_b32 s46, s45, 16 +; SI-NEXT: s_and_b32 s65, s44, 0xffff0000 ; SI-NEXT: s_lshl_b32 s28, s44, 16 -; SI-NEXT: s_and_b32 s65, s43, 0xffff0000 -; SI-NEXT: s_lshl_b32 s64, s43, 16 -; SI-NEXT: s_and_b32 s66, s42, 0xffff0000 +; SI-NEXT: s_and_b32 s66, s43, 0xffff0000 +; SI-NEXT: s_lshl_b32 s43, s43, 16 +; SI-NEXT: s_and_b32 s67, s42, 0xffff0000 ; SI-NEXT: s_lshl_b32 s87, s42, 16 -; SI-NEXT: s_and_b32 s68, s41, 0xffff0000 -; SI-NEXT: s_lshl_b32 s67, s41, 16 -; SI-NEXT: s_and_b32 s69, s40, 0xffff0000 +; SI-NEXT: s_and_b32 s69, s41, 0xffff0000 +; SI-NEXT: s_lshl_b32 s68, s41, 16 +; SI-NEXT: s_and_b32 s70, s40, 0xffff0000 ; SI-NEXT: s_lshl_b32 s86, s40, 16 -; SI-NEXT: s_and_b32 s62, s15, 
0xffff0000 -; SI-NEXT: s_lshl_b32 s70, s15, 16 +; SI-NEXT: s_and_b32 s93, s15, 0xffff0000 +; SI-NEXT: s_lshl_b32 s40, s15, 16 ; SI-NEXT: s_and_b32 s80, s14, 0xffff0000 ; SI-NEXT: s_lshl_b32 s85, s14, 16 -; SI-NEXT: s_and_b32 s92, s5, 0xffff0000 +; SI-NEXT: s_and_b32 s45, s5, 0xffff0000 ; SI-NEXT: s_lshl_b32 s11, s5, 16 ; SI-NEXT: s_and_b32 s83, s4, 0xffff0000 ; SI-NEXT: s_lshl_b32 s84, s4, 16 -; SI-NEXT: v_writelane_b32 v42, s7, 55 ; SI-NEXT: .LBB89_3: ; %end -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s27 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s26 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s29 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s20 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s25 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s29 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s30 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; 
SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s24 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s21 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s78 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s22 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s73 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s77 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_mul_f32_e64 v1, 1.0, s63 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s89 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s79 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s59 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s48 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s88 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s13 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s59 -; SI-NEXT: v_alignbit_b32 v1, v1, 
v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: v_readlane_b32 s4, v42, 50 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s23 -; SI-NEXT: v_readlane_b32 s4, v42, 51 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s53 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: v_readlane_b32 s4, v42, 52 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s54 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s6 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: v_readlane_b32 s4, v42, 53 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_readlane_b32 s4, v42, 54 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s90 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: 
v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s72 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s99 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s92 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s99 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: v_readlane_b32 s4, v42, 55 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s58 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s57 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s8 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s63 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s8 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s56 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s51 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s50 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s97 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s55 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s97 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s46 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s64 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s53 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s65 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s28 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s43 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s66 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s64 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s87 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s67 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, 
s87 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s69 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s67 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s86 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s70 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s86 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s40 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s93 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s70 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s80 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s85 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s80 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s85 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt 
expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s11 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s45 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s11 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s83 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s84 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s83 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s84 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload @@ -151075,103 +152105,95 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB89_4: -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; kill: killed $sgpr8 -; SI-NEXT: s_mov_b32 s7, s6 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr8 -; SI-NEXT: v_readlane_b32 s58, v43, 19 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: s_mov_b32 s95, s47 -; SI-NEXT: s_mov_b32 s94, s21 -; SI-NEXT: s_mov_b32 s93, s61 -; SI-NEXT: s_mov_b32 s34, s73 -; SI-NEXT: s_mov_b32 s91, s75 -; SI-NEXT: v_readlane_b32 s56, v43, 10 -; SI-NEXT: s_mov_b32 s36, s63 -; SI-NEXT: s_mov_b32 s38, s59 -; SI-NEXT: s_mov_b32 s37, s42 -; SI-NEXT: v_readlane_b32 s30, v43, 17 +; SI-NEXT: s_mov_b32 s91, s25 +; SI-NEXT: s_mov_b32 s34, s57 +; SI-NEXT: s_mov_b32 s37, s75 +; SI-NEXT: s_mov_b32 s31, s28 +; SI-NEXT: s_mov_b32 s35, s56 +; SI-NEXT: v_readlane_b32 s72, v43, 10 +; SI-NEXT: s_mov_b32 s38, s62 +; 
SI-NEXT: s_mov_b32 s94, s74 +; SI-NEXT: s_mov_b32 s50, s60 +; SI-NEXT: s_mov_b32 s95, s42 ; SI-NEXT: v_readlane_b32 s98, v43, 6 -; SI-NEXT: s_mov_b32 s46, s45 -; SI-NEXT: s_mov_b32 s31, s43 -; SI-NEXT: s_mov_b32 s78, s40 +; SI-NEXT: s_mov_b32 s47, s76 +; SI-NEXT: s_mov_b32 s36, s43 +; SI-NEXT: s_mov_b32 s42, s40 ; SI-NEXT: v_readlane_b32 s15, v43, 14 -; SI-NEXT: s_mov_b32 s39, s57 -; SI-NEXT: s_mov_b32 s48, s13 +; SI-NEXT: s_mov_b32 s39, s59 ; SI-NEXT: v_readlane_b32 s41, v43, 13 ; SI-NEXT: v_readlane_b32 s44, v43, 5 ; SI-NEXT: v_readlane_b32 s9, v43, 11 ; SI-NEXT: v_readlane_b32 s14, v43, 12 ; SI-NEXT: v_readlane_b32 s81, v43, 9 ; SI-NEXT: v_readlane_b32 s10, v43, 16 +; SI-NEXT: s_mov_b32 s7, s6 ; SI-NEXT: v_readlane_b32 s12, v43, 4 ; SI-NEXT: v_readlane_b32 s96, v43, 7 ; SI-NEXT: v_readlane_b32 s82, v43, 8 ; SI-NEXT: v_readlane_b32 s71, v43, 15 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr27 ; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $sgpr24 ; SI-NEXT: ; implicit-def: $sgpr29 -; SI-NEXT: ; implicit-def: $sgpr35 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr21 +; SI-NEXT: ; implicit-def: $sgpr23 ; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr21 +; SI-NEXT: ; implicit-def: $sgpr24 ; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $sgpr75 -; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr73 ; SI-NEXT: ; implicit-def: $sgpr16 ; SI-NEXT: ; implicit-def: $sgpr77 -; SI-NEXT: ; implicit-def: $sgpr79 -; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $sgpr19 -; SI-NEXT: ; implicit-def: $sgpr89 ; 
SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr53 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr90 ; SI-NEXT: ; implicit-def: $sgpr99 -; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr92 ; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; kill: killed $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr49 -; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr51 ; SI-NEXT: ; implicit-def: $sgpr97 -; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr53 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr46 ; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr65 -; SI-NEXT: ; implicit-def: $sgpr87 +; SI-NEXT: ; implicit-def: $sgpr43 ; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr87 ; SI-NEXT: ; implicit-def: $sgpr67 ; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr86 ; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr86 ; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr62 +; 
SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr93 ; SI-NEXT: ; implicit-def: $sgpr85 ; SI-NEXT: ; implicit-def: $sgpr80 ; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr45 ; SI-NEXT: ; implicit-def: $sgpr84 ; SI-NEXT: ; implicit-def: $sgpr83 ; SI-NEXT: s_branch .LBB89_2 @@ -154937,1990 +155959,2106 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v43, off, 
s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 -; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v17 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v19 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v21 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v23 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 -; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v27 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:672 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23 -; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v24 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v32 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v36 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v36 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v55 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v56 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v42 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v44 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v45 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v58 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; kill: killed $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v48 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v49 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v51 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v52 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v55 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v44 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v63 -; SI-NEXT: ; kill: killed $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v58 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v59 +; SI-NEXT: v_mul_f32_e32 v51, 
1.0, v33 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v46 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v47 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v57 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v60 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v61 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v37 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; kill: killed $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: 
v_mul_f32_e32 v41, 1.0, v1 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; kill: killed $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v2 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v3 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v3 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v4 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v5 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v6 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v7 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v10 -; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v8 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed 
$vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: 
killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; 
kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; 
SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 
+; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: 
$vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; 
implicit-def: $vgpr2 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v3 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v4 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB90_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_alignbit_b32 v38, v1, v2, 16 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload 
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v31, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v32, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v35, v1, v2, 16 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v29, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v39 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v30, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v32, v1, v2, 16 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v27, v2, v3 +; SI-NEXT: buffer_load_dword v2, 
off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v28, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v29, v1, v2, 16 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v25, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v26, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v26, v1, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v58 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v23, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v24, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_alignbit_b32 v40, v1, v2, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v21, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v49 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v22, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v1, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v19, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v50 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v20, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_alignbit_b32 v54, v1, v2, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], 
s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v17, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v18, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v18, v1, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v15, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v53 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v16, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_alignbit_b32 v52, v1, v2, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v13, 
v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v14, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v15, v1, v57, 16 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v57 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v50, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v11, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v12, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v38 +; SI-NEXT: v_or_b32_e32 v9, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v62 +; SI-NEXT: v_or_b32_e32 v10, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 +; SI-NEXT: v_or_b32_e32 v7, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 +; SI-NEXT: v_or_b32_e32 v8, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v45 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 +; SI-NEXT: v_or_b32_e32 v6, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_and_b32_e32 v3, 
0xffff0000, v57 +; SI-NEXT: v_or_b32_e32 v3, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v47 +; SI-NEXT: v_or_b32_e32 v4, v2, v4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v61 +; SI-NEXT: v_or_b32_e32 v35, v2, v33 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v60 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v42 +; SI-NEXT: v_or_b32_e32 v2, v2, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v37 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v42 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v13, v1, v61, 16 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v42 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_alignbit_b32 v48, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v37 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v11, v1, v60, 16 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v59 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_alignbit_b32 v37, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v39 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; 
SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v8, v1, v51, 16 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_alignbit_b32 v34, v1, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v39 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v46 -; SI-NEXT: v_alignbit_b32 v5, v1, v53, 16 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_alignbit_b32 v31, v1, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v63 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; SI-NEXT: v_alignbit_b32 v4, v1, v9, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_alignbit_b32 v28, v1, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v63 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_lshrrev_b32_e32 v1, 16, v56 -; SI-NEXT: v_alignbit_b32 v3, v1, v41, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v45 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_alignbit_b32 v25, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v63 -; SI-NEXT: v_alignbit_b32 v2, v1, v16, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; SI-NEXT: v_alignbit_b32 v22, v1, v7, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_alignbit_b32 v20, v7, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v6 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v44 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v40, v38, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v40, v38, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v40, v38, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v54, v35, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v54, v35, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v54, v35, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v52, v32, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v52, v32, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v52, v32, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded 
Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v50, v29, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v50, v29, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v50, v29, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v48, v26, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v48, v26, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v48, v26, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v59 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v37, v23, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte 
Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v37, v23, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v37, v23, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v34, v18, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v34, v18, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v34, v18, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v31, v15, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v31, v15, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v30, v29, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:520 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v31, v15, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v30, v29, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v28, v13, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v30, v29, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v28, v13, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v28, v27, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v28, v13, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v28, v27, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v25, v11, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v28, v27, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v25, v11, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v26, v25, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v25, v11, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v26, v25, 16 +; SI-NEXT: buffer_store_dword 
v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v22, v8, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v26, v25, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v22, v8, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v24, v23, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v22, v8, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v24, v23, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v20, v5, 24 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v24, v23, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v19 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v22, v21, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v20, v5, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v22, v21, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v20, v5, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:208 
; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_alignbit_b32 v17, v7, v9, 16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v6, v17, v4, 24 +; SI-NEXT: v_alignbit_b32 v1, v22, v21, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v12 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v20, v19, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v17, v4, 16 -; SI-NEXT: v_alignbit_b32 v14, v7, v27, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v20, v19, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v17, v4, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v20, v19, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v14, v3, 24 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v18, v17, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v36 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v18, v17, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v14, v3, 16 -; SI-NEXT: v_alignbit_b32 v10, v7, v39, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: 
v_alignbit_b32 v1, v18, v17, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v14, v3, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v16, v15, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v10, v2, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v16, v15, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v16, v15, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v44 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v14, v13, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v10, v2, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v43, 16 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v14, v13, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v7, v7, v55, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v14, v13, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v10, v2, 8 -; SI-NEXT: buffer_store_dword v6, off, 
s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v12, v11, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v7, v1, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v12, v11, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v7, v1, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v12, v11, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v7, v1, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v10, v9, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v40 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v10, v9, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v54 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v10, v9, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v52 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v8, v7, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v50 -; SI-NEXT: buffer_store_dword v6, off, 
s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v8, v7, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v48 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v8, v7, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v37 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v6, v5, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v34 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v6, v5, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v31 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v6, v5, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v28 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v4, v3, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v25 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v4, v3, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v22 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 
offset:500 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v4, v3, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v20 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v2, v35, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v17 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v2, v35, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v14 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v2, v35, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v10 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v7 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed 
$vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v58 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v33 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v47 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) 
-; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v57 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v42 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v59 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v49 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v51 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v62 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, 
v14 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v45 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v12 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v24 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v19 -; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v36 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v10 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v6 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v4 +; SI-NEXT: ; kill: killed $vgpr33 
+; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; 
SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v62 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: 
$vgpr36 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: .LBB90_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB90_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v63 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v35, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v30 -; SI-NEXT: v_alignbit_b32 v3, 
v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v55 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v44 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v46 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v42, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v42 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v5, v8, v5, 16 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v36 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v51 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: 
v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_alignbit_b32 v8, v11, v8, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v52 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v34, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v34 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v54, 0x40c00000, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v53, 0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v51, 0x40c00000, v19 +; 
SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v50, 0x40c00000, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v48 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v12 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v60 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v39 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v34 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_alignbit_b32 v11, v14, v11, 16 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v14, v14, v13, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v42 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v16 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword 
v40, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_alignbit_b32 v13, v17, v13, 16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v19 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v17, v17, v15, 16 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v37 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: 
v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v38 -; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 -; SI-NEXT: v_add_f32_e32 v51, 0x40c00000, v50 -; SI-NEXT: v_add_f32_e32 v53, 0x40c00000, v52 -; SI-NEXT: v_add_f32_e32 v55, 0x40c00000, v54 -; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v40 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v52, 
16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v41 -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_add_f32_e32 v55, 0x40c00000, v30 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_add_f32_e32 v40, 0x40c00000, v32 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v40 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 
16, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v15 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v57 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v15, v20, v15, 16 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v20, v20, v18, 16 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v6 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v9 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 
offset:672 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; SI-NEXT: v_alignbit_b32 v18, v23, v18, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v21, 24, v21 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v23 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v45 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt 
vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_alignbit_b32 v23, v26, v23, 16 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v24 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v62 -; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v26 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; 
SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; SI-NEXT: v_alignbit_b32 v26, v29, v26, 16 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v27, 24, v27 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v27, v1, v27 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_alignbit_b32 v29, v32, v29, 16 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v30, 24, v30 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 
v32, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v34, 0x40c00000, v32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v28, v1, v28 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v29, v1, v29 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: v_alignbit_b32 v32, v35, v32, 16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v33 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v34, v35, v34, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v33 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v35 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v30, v1, v30 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded 
Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v35 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v31, v1, v31 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; SI-NEXT: v_alignbit_b32 v35, v38, v35, 16 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v36 -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v37, v38, v37, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v32, v1, v32 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v36, 24, v36 -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v38 -; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v38 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v38 -; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v38 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v38 -; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 -; SI-NEXT: v_alignbit_b32 v38, v49, v38, 16 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v39 -; 
SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v48, v49, v48, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v39 -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 -; SI-NEXT: v_alignbit_b32 v50, v50, v49, 16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 -; SI-NEXT: v_alignbit_b32 v52, v52, v49, 16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 -; SI-NEXT: v_alignbit_b32 v54, v54, v49, 16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 -; SI-NEXT: v_alignbit_b32 v40, v40, v49, 16 -; SI-NEXT: v_alignbit_b32 v6, v40, v38, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v40, v38, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:580 ; 
4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v40, v38, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v54, v35, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v54, v35, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v54, v35, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v37 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v52, v32, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v37, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v52, v32, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: 
v_lshrrev_b32_e32 v1, 24, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v52, v32, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v50, v29, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v50, v29, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v50, v29, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v48, v26, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v48, v26, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v48, v26, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; 
SI-NEXT: v_lshrrev_b32_e32 v1, 24, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v37, v23, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v37, v23, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v37, v23, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v34, v18, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v34, v18, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], 
s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v34, v18, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v31, v15, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v31, v15, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v32, v31, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v31, v15, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v30, v29, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v28, v13, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v30, v29, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v28, v13, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v30, v29, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v28, v13, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v28, v27, 24 +; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v25, v11, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v28, v27, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v25, v11, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v28, v27, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v25, v11, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v26, v25, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v22, v8, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v26, v25, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v22, v8, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v26, v25, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v22, v8, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v24, v23, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v20, v5, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v24, v23, 
16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v20, v5, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v24, v23, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v20, v5, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v22, v21, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v17, v4, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v22, v21, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v17, v4, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v22, v21, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v20, v19, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v20, v19, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v20, v19, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v17, v4, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v18, v17, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded 
Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v14, v3, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v18, v17, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v14, v3, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v18, v17, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v14, v3, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v16, v15, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v10, v2, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v16, v15, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v10, v2, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v16, v15, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v10, v2, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v14, v13, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v7, v1, 24 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v14, v13, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 
4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v7, v1, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v14, v13, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v12, v11, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v12, v11, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v7, v1, 8 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v12, v11, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v40 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v10, v9, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v54 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v10, v9, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v52 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v10, v9, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v50 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v8, v7, 24 +; SI-NEXT: buffer_store_dword 
v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v48 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v8, v7, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v37 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v8, v7, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v34 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v6, v5, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v31 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v6, v5, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v28 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v6, v5, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v25 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v4, v3, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v4, v3, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v4, v3, 8 +; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v2, v35, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v2, v35, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v2, v35, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v22 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v20 -; SI-NEXT: v_lshrrev_b32_e32 v49, 24, v41 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: 
v_lshrrev_b32_e32 v1, 8, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v17 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v49, 24, v55 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v14 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v49, 24, v53 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v10 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v49, 24, v51 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v40, 24, v40 +; SI-NEXT: v_lshrrev_b32_e32 v55, 24, v55 +; SI-NEXT: 
v_mov_b32_e32 v39, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v7 -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: .LBB90_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xff, v38 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v31, 8, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: 
v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v33, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v31 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v40 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v31, 8, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v31, s[0:3], 0 
offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v35 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v31, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v54 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; 
SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v32 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: 
v_lshlrev_b32_e32 v29, 24, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v29, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v52 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: 
v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v29 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v27, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v50 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; 
SI-NEXT: v_and_b32_e32 v1, 0xff, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v26 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v23 +; SI-NEXT: 
buffer_load_dword v23, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v25, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v23, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v48 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 36, 
v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v23, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v23 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v23 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v23, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v21, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v21, off, s[0:3], 
s32 offset:552 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v37 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v21, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v18 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: 
buffer_load_dword v9, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v21 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v34 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v20 ; SI-NEXT: 
s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v15 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: 
v_or_b32_e32 v17, v19, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v17, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v31 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v1, v1, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v13 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: v_or_b32_e32 v1, v1, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v17, v15 +; SI-NEXT: v_or_b32_e32 v1, v1, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v15, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v28 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: v_or_b32_e32 v1, v1, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 
offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v1, v1, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword 
v6, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v15, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v13, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v25 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v13, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v13, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_add_i32_e32 
v11, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v11, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v1, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:444 ; 4-byte 
Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v22 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 
4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_or_b32_e32 v6, v6, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v20 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: 
v_and_b32_e32 v1, 0xff, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: 
v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v47 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], 
s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v33 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; 
SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v58 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], 
s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; 
SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -156968,1187 +158106,1285 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr60 -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; 
kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: ; implicit-def: $vgpr62 -; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr63 -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 
offset:168 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; 
implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr36 ; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v50, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 
-; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: 
; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 -; VI-NEXT: ; kill: killed $vgpr33 -; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; 
implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB90_2 ; VI-NEXT: 
; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v29 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v28 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v28 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v27 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v27 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v26 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v26 -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v25 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; 
VI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v23 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v23 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5 -; VI-NEXT: buffer_store_dword 
v51, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v21 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v4 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v19 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: 
v_lshrrev_b32_e32 v33, 8, v2 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[15:16] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v1 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v17 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[13:14] -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[11:12] -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[9:10] +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v16 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v13 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v12 +; VI-NEXT: 
buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v12 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v11 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v11 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v10 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v10 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v9 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v8 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v8 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v7 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v6 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; 
VI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v6 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v4 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v4 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v3 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v2 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v2 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v1 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v58 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v58 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v57 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v57 +; VI-NEXT: buffer_store_dword v31, off, 
s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v30 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v30 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v29 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v28 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v27 +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v24 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v26 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v23 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v22 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v19 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; VI-NEXT: 
buffer_store_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v21 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v18 +; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[15:16] +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v25 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v20 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v17 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[13:14] +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[9:10] +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[3:4] +; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v26 +; VI-NEXT: v_lshrrev_b64 v[53:54], 24, v[1:2] +; VI-NEXT: v_mov_b32_e32 v54, v50 +; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[57:58] +; VI-NEXT: 
buffer_store_dword v49, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[29:30] +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[27:28] +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[7:8] +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[25:26] +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] +; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v16 +; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v24 +; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[5:6] +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, v46 +; VI-NEXT: v_mov_b32_e32 v46, v45 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[19:20] ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[7:8] -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[5:6] -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[3:4] -; 
VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[1:2] -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[31:32] -; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v12 -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v45, v46 -; VI-NEXT: v_lshrrev_b64 v[46:47], 24, v[29:30] -; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v11 -; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v31 -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v46, v63 -; VI-NEXT: v_mov_b32_e32 v63, v50 -; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[27:28] -; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v8 -; VI-NEXT: v_mov_b32_e32 v51, v57 -; VI-NEXT: v_mov_b32_e32 v50, v56 -; VI-NEXT: v_lshrrev_b64 v[56:57], 24, v[25:26] -; VI-NEXT: v_mov_b32_e32 v57, v43 -; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[23:24] -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[21:22] -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill 
-; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[19:20] -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[17:18] -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v12 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v40, 24, v10 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v32 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v32 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v31 -; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v30 -; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v30 -; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v30 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; VI-NEXT: v_mov_b32_e32 v47, v34 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[17:18] +; VI-NEXT: v_lshrrev_b32_e32 v61, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v47, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v58 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v22 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 
v38, 24, v20 +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v48, 24, v18 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[23:24] +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v16 ; VI-NEXT: .LBB90_2: ; %Flow -; VI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: s_xor_b64 exec, exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB90_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v18 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 -; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v18 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31 +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x7fff, v32 +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v18, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; VI-NEXT: v_bfe_u32 v32, v18, 16, 1 ; VI-NEXT: s_movk_i32 s6, 0x7fff -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v18 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v18 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v18 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v18 ; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_cndmask_b32_e32 v18, v34, v35, vcc +; VI-NEXT: v_cndmask_b32_e32 v18, v32, v33, vcc +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; VI-NEXT: v_alignbit_b32 v18, v18, v33, 16 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v17 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_or_b32_e32 v34, v18, v31 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v17 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v17 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v17 +; VI-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; VI-NEXT: v_bfe_u32 v32, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v17 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v34, v35, vcc +; VI-NEXT: v_cndmask_b32_e32 v17, v32, v33, vcc +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_alignbit_b32 v17, v17, v33, 16 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v20 -; VI-NEXT: 
v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_or_b32_e32 v33, v17, v31 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v20 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v20, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v20 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v20 +; VI-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; VI-NEXT: v_bfe_u32 v32, v20, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v20 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v20 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; VI-NEXT: v_cndmask_b32_e32 v20, v34, v35, vcc +; VI-NEXT: v_cndmask_b32_e32 v20, v32, v33, vcc +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_alignbit_b32 v20, v20, v33, 16 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v19 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; VI-NEXT: v_or_b32_e32 v35, 
0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_or_b32_e32 v34, v20, v31 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v19 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v19 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v19 +; VI-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; VI-NEXT: v_bfe_u32 v32, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v19 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v19 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_cndmask_b32_e32 v19, v34, v35, vcc +; VI-NEXT: v_cndmask_b32_e32 v19, v32, v33, vcc +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_alignbit_b32 v19, v19, v33, 16 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v22 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_or_b32_e32 v33, v19, v31 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v22 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 
offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v22, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v22 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v22 +; VI-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; VI-NEXT: v_bfe_u32 v32, v22, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v22 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v22 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; VI-NEXT: v_cndmask_b32_e32 v22, v34, v35, vcc +; VI-NEXT: v_cndmask_b32_e32 v22, v32, v33, vcc +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_alignbit_b32 v22, v22, v33, 16 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v21 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_or_b32_e32 v34, v22, v31 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v21 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v21 -; VI-NEXT: v_add_u32_e32 v34, 
vcc, s6, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v21 +; VI-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; VI-NEXT: v_bfe_u32 v32, v21, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v21 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v21 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: v_cndmask_b32_e32 v21, v34, v35, vcc +; VI-NEXT: v_cndmask_b32_e32 v21, v32, v33, vcc +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_alignbit_b32 v21, v21, v33, 16 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v24 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_or_b32_e32 v33, v21, v31 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v24 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v24, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v24 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v24 +; VI-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; VI-NEXT: v_bfe_u32 v32, v24, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v24 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; 
VI-NEXT: v_or_b32_e32 v33, 0x400000, v24 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_cndmask_b32_e32 v24, v34, v35, vcc +; VI-NEXT: v_cndmask_b32_e32 v24, v32, v33, vcc +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_alignbit_b32 v24, v24, v33, 16 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v23 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_or_b32_e32 v34, v24, v31 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v23 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v23 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v23 +; VI-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; VI-NEXT: v_bfe_u32 v32, v23, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v23 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v23 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_cndmask_b32_e32 v23, v34, v35, vcc +; VI-NEXT: v_cndmask_b32_e32 v23, v32, v33, vcc +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_alignbit_b32 v23, v23, v33, 16 -; VI-NEXT: 
v_lshlrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_or_b32_e32 v33, v23, v31 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v26 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v26, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v26 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v26 +; VI-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; VI-NEXT: v_bfe_u32 v32, v26, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v26 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v26 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_cndmask_b32_e32 v26, v34, v35, vcc +; VI-NEXT: v_cndmask_b32_e32 v26, v32, v33, vcc +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_alignbit_b32 v26, v26, v33, 16 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v25 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_and_b32_e32 v25, 
0xffff0000, v25 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_or_b32_e32 v34, v26, v31 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v25 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v25 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v25 +; VI-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; VI-NEXT: v_bfe_u32 v32, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v25 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v25 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_cndmask_b32_e32 v25, v34, v35, vcc +; VI-NEXT: v_cndmask_b32_e32 v25, v32, v33, vcc +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_alignbit_b32 v25, v25, v33, 16 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v28 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_or_b32_e32 v33, v25, v31 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v28 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v28, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v28 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v28 +; VI-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; VI-NEXT: v_bfe_u32 v32, v28, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v28 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v28 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; VI-NEXT: v_cndmask_b32_e32 v28, v34, v35, vcc +; VI-NEXT: v_cndmask_b32_e32 v28, v32, v33, vcc +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_alignbit_b32 v28, v28, v33, 16 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_or_b32_e32 v34, v28, v31 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v27 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, 
v34, v27 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v27 +; VI-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; VI-NEXT: v_bfe_u32 v32, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v27 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v27 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: v_cndmask_b32_e32 v27, v34, v35, vcc +; VI-NEXT: v_cndmask_b32_e32 v27, v32, v33, vcc +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_alignbit_b32 v27, v27, v33, 16 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v30 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_or_b32_e32 v33, v27, v31 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31 +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 ; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v30, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v30 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v30 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_bfe_u32 v31, v30, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v34, v32, v33, vcc +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v30 +; VI-NEXT: 
v_add_u32_e32 v31, vcc, s6, v31 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v30 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; VI-NEXT: v_cndmask_b32_e32 v30, v34, v35, vcc -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_alignbit_b32 v30, v30, v33, 16 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v29 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_cndmask_b32_e32 v30, v31, v32, vcc +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v29 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31 +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 ; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v29 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v29 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_bfe_u32 v31, v29, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v35, v32, v33, vcc +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v29 +; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v29 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: v_cndmask_b32_e32 v29, v34, v35, vcc +; VI-NEXT: v_cndmask_b32_e32 v29, v31, v32, vcc +; VI-NEXT: v_and_b32_e32 v33, 0xffff0000, v34 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 ; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_alignbit_b32 v29, v29, v33, 16 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v32 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: 
v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v35 +; VI-NEXT: v_or_b32_e32 v32, v30, v33 +; VI-NEXT: v_or_b32_e32 v31, v29, v31 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v58 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v58 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_cndmask_b32_e32 v35, v32, v33, vcc +; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v57 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v32 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_alignbit_b32 v32, v32, v33, 
16 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v31 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v57 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_cndmask_b32_e32 v36, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v31 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v35 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v32 +; VI-NEXT: v_or_b32_e32 v49, v58, v31 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v36 +; VI-NEXT: v_or_b32_e32 v48, v57, v31 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v2 ; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v31, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v31 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v31 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; VI-NEXT: v_cndmask_b32_e32 v31, v34, v35, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v33, 16 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v2 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_add_u32_e32 
v32, vcc, s6, v32 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v2 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v2 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_bfe_u32 v31, v2, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v34, v32, v33, vcc +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v2 +; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v34, v35, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v33, 16 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_cndmask_b32_e32 v2, v31, v32, vcc +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v1 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v1 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_bfe_u32 v31, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v33, v32, v33, vcc +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v1 +; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 
vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v34, v35, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v31, v32, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v34 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v33, 16 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v4 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_or_b32_e32 v51, v2, v31 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33 +; VI-NEXT: v_or_b32_e32 v50, v1, v31 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v4 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v4 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v4 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_bfe_u32 v31, v4, 16, 1 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_cndmask_b32_e32 v34, v32, v33, vcc +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v4 +; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v34, v35, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v33, 16 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v3 -; VI-NEXT: v_add_f32_e32 v33, 
0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_cndmask_b32_e32 v4, v31, v32, vcc +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v3 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v3 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v3 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_bfe_u32 v31, v3, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v33, v32, v33, vcc +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v3 +; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v34, v35, vcc +; VI-NEXT: v_cndmask_b32_e32 v3, v31, v32, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v34 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v33, 16 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v6 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_or_b32_e32 v53, v4, v31 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33 +; VI-NEXT: v_or_b32_e32 v52, v3, v31 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v6 +; VI-NEXT: v_add_f32_e32 v31, 
0x40c00000, v31 +; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31 +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v6 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v6 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_bfe_u32 v31, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v38, v32, v33, vcc +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v6 +; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_cndmask_b32_e32 v6, v34, v35, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v33, 16 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_cndmask_b32_e32 v6, v31, v32, vcc +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v5 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v5 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v5 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, 
v31, v31 +; VI-NEXT: v_bfe_u32 v31, v5, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v33, v32, v33, vcc +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v5 +; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_cndmask_b32_e32 v5, v34, v35, vcc +; VI-NEXT: v_cndmask_b32_e32 v5, v31, v32, vcc +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v38 ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v33, 16 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_or_b32_e32 v43, v6, v31 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33 +; VI-NEXT: v_or_b32_e32 v42, v5, v31 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v8 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v8 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v8 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_bfe_u32 v31, v8, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v41, v32, v33, vcc +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v8 +; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v34, v35, vcc 
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v33, 16 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_cndmask_b32_e32 v8, v31, v32, vcc +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v7 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v7 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v7 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_bfe_u32 v31, v7, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v33, v32, v33, vcc +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v7 +; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v34, v35, vcc +; VI-NEXT: v_cndmask_b32_e32 v7, v31, v32, vcc +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v41 ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v33, 16 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v10 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_or_b32_e32 v40, v8, v31 
+; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33 +; VI-NEXT: v_or_b32_e32 v39, v7, v31 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v10 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v10, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v10 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_bfe_u32 v31, v10, 16, 1 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_cndmask_b32_e32 v35, v32, v33, vcc +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v10 +; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; VI-NEXT: v_cndmask_b32_e32 v10, v34, v35, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v33, 16 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v9 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_cndmask_b32_e32 v10, v31, v32, vcc +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v9 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; VI-NEXT: 
v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v9 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v9 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_bfe_u32 v31, v9, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v62, v32, v33, vcc +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v9 +; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v34, v35, vcc +; VI-NEXT: v_cndmask_b32_e32 v9, v31, v32, vcc +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v35 ; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v33, 16 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v12 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_or_b32_e32 v55, v10, v31 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v62 +; VI-NEXT: v_or_b32_e32 v54, v9, v31 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v12 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v12 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v12 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_bfe_u32 v31, v12, 16, 1 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 
offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_cndmask_b32_e32 v34, v32, v33, vcc +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v12 +; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; VI-NEXT: v_cndmask_b32_e32 v12, v34, v35, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v33, 16 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v11 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_cndmask_b32_e32 v12, v31, v32, vcc +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v11 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v11 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v11 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_bfe_u32 v31, v11, 16, 1 +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_cndmask_b32_e32 v36, v32, v33, vcc +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v11 +; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_cndmask_b32_e32 v11, v34, v35, vcc +; VI-NEXT: v_cndmask_b32_e32 v11, v31, v32, vcc +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v34 ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: 
v_alignbit_b32 v11, v11, v33, 16 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v14 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_or_b32_e32 v45, v12, v31 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v36 +; VI-NEXT: v_or_b32_e32 v44, v11, v31 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v14 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v14, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v14 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v14 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_bfe_u32 v31, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v33, v32, v33, vcc +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v14 +; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; VI-NEXT: v_cndmask_b32_e32 v14, v34, v35, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v33, 16 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v13 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_cndmask_b32_e32 v14, v31, v32, vcc +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v13 +; VI-NEXT: v_add_f32_e32 v31, 
0x40c00000, v31 +; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v13 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v13 +; VI-NEXT: v_or_b32_e32 v46, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_bfe_u32 v31, v13, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v63, v32, v46, vcc +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v13 +; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_cndmask_b32_e32 v13, v34, v35, vcc +; VI-NEXT: v_cndmask_b32_e32 v13, v31, v32, vcc +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33 ; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v33, 16 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_or_b32_e32 v61, v14, v31 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v63 +; VI-NEXT: v_or_b32_e32 v60, v13, v31 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v16 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_or_b32_e32 v46, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; 
VI-NEXT: v_bfe_u32 v34, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v16 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v16 +; VI-NEXT: v_cndmask_b32_e32 v31, v32, v46, vcc +; VI-NEXT: v_bfe_u32 v32, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v16 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; VI-NEXT: v_or_b32_e32 v46, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v34, v35, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v16, v16, v33, 16 -; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v15 -; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_cndmask_b32_e32 v16, v32, v46, vcc +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v46, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v46, vcc, v46, v32 +; VI-NEXT: v_add_u32_e32 v46, vcc, s6, v46 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_or_b32_e32 v47, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 ; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v15 -; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v15 +; VI-NEXT: v_cndmask_b32_e32 v32, v46, v47, vcc +; VI-NEXT: v_bfe_u32 v46, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v46, vcc, v46, v15 +; VI-NEXT: v_add_u32_e32 v46, vcc, s6, v46 +; VI-NEXT: v_or_b32_e32 v47, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_cndmask_b32_e32 v15, v34, v35, vcc +; VI-NEXT: v_cndmask_b32_e32 v15, v46, v47, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: 
v_and_b32_e32 v46, 0xffff0000, v31 ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v33, 16 -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[3:4] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[1:2] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 
4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[23:24] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v28 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v28 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v27 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v27 -; VI-NEXT: 
buffer_store_dword v43, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v26 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v26 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v26 -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v25 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v25 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v23 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: buffer_store_dword v43, 
off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v23 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v21 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v21 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: 
v_lshrrev_b32_e32 v33, 8, v4 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v19 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v19 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v2 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v18 -; VI-NEXT: v_lshrrev_b64 v[56:57], 24, v[25:26] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v12 -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v12 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v11 -; VI-NEXT: v_lshrrev_b32_e32 v40, 24, v10 
-; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v6 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v32 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v32 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v31 -; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v31 -; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v30 -; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v30 -; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v30 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v29 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v17 -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; VI-NEXT: .LBB90_4: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_e32 v47, v16, v46 +; VI-NEXT: v_and_b32_e32 v46, 0xffff0000, v32 +; VI-NEXT: v_or_b32_e32 v46, v15, v46 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v47 +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v46 +; VI-NEXT: v_lshrrev_b64 v[46:47], 24, v[46:47] +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v60 +; VI-NEXT: v_lshrrev_b64 v[59:60], 24, v[60:61] +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v45 +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v44 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[44:45] +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v55 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v54 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[54:55] +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v40 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v39 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v43 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v42 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v53 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:236 ; 
4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v52 +; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[52:53] +; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v51 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v50 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[53:54], 24, v[50:51] +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v49 +; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v48 +; VI-NEXT: v_lshrrev_b64 v[48:49], 24, v[48:49] +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[42:43] +; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v31 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v61 +; VI-NEXT: v_lshrrev_b32_e32 v61, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v63 +; VI-NEXT: v_lshrrev_b32_e32 v47, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v33 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[39:40] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v50 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v49 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[48:49], 24, v[49:50] +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:100 ; 4-byte 
Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v50 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v49 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[48:49], 24, v[49:50] +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v49 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b64 v[48:49], 24, v[49:50] +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v50 +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v50 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v49 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[49:50] +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v51 +; VI-NEXT: 
buffer_store_dword v48, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v50 +; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[50:51] +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v51 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v50 +; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[50:51] +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[50:51] +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v31 +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v34 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v34 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v36 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v35 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill 
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v35 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v62 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v41 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v41 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v51 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v50 +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v38 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v38 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshrrev_b32_e32 v48, 24, v50 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v50 +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v32 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v32 +; VI-NEXT: 
buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v50 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v44 -; VI-NEXT: v_or_b32_sdwa v1, v1, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v32 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v32 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v32 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v32 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v32 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v32 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_load_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v32 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v32 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v32 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v32 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v32 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v32 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v32 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v43 -; VI-NEXT: v_or_b32_sdwa v2, v2, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 
offset:188 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v32 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v43 +; VI-NEXT: v_lshrrev_b32_e32 v38, 24, v32 +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v32 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v40, 24, v31 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v43, v44, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; VI-NEXT: .LBB90_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v50 +; VI-NEXT: v_or_b32_sdwa v1, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v50 +; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v53 +; VI-NEXT: v_or_b32_sdwa v33, v33, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v33 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v52 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; VI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -158159,242 +159395,231 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v43 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: 
v_add_u32_e32 v2, vcc, 16, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v38 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v54 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v60 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: 
v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v35 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v57 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v41 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; 
VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v50 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v40 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v46 -; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, 
vcc, 40, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v49 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v45 -; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 
; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v37 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v47 +; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v46 +; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt 
vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v44 +; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; 
VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v48 +; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; 
VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v38 +; VI-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v42 +; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v2, 
8, v2 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v49 +; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v40 +; VI-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 
4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v56 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -158402,36 +159627,33 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, 
v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -158442,39 +159664,57 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 -; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v53 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: 
v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v63 -; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v48 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37 -; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v52, v2 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v60 +; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload @@ -162540,484 +163780,613 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], 
s32 offset:32 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v42, 
off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v29 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22 -; SI-NEXT: 
v_mul_f32_e64 v10, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v17, 1.0, s28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v30 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v32 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v33 ; SI-NEXT: s_and_b64 s[4:5], 
vcc, exec -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v34 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v35 ; SI-NEXT: v_mul_f32_e32 v35, 1.0, v36 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v38 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v38 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v39 ; SI-NEXT: v_mul_f32_e32 v38, 1.0, v48 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v49 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v49 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v50 ; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v51 +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 ; SI-NEXT: v_mul_f32_e32 v50, 1.0, v52 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v53 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v54 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v55 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v40 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v41 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v42 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v42 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v43 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v49, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s29 -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:488 ; 4-byte 
Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: 
buffer_store_dword v23, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v43 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s23 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v52, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, 
off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 
offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB91_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mov_b32_e32 v43, v36 -; SI-NEXT: v_alignbit_b32 v36, v1, v2, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: 
v_alignbit_b32 v6, v1, v6, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_alignbit_b32 v2, v1, v13, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 -; SI-NEXT: v_alignbit_b32 v5, v1, v17, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_alignbit_b32 v4, v1, v3, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44 -; SI-NEXT: v_alignbit_b32 v3, v1, v7, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47 -; SI-NEXT: v_alignbit_b32 v16, v1, v57, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v13, v1, v58, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_alignbit_b32 v10, v1, v60, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v43, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 +; SI-NEXT: v_or_b32_e32 v32, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16 +; SI-NEXT: v_or_b32_e32 v17, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v15 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 +; SI-NEXT: v_mov_b32_e32 v11, v30 +; SI-NEXT: v_or_b32_e32 v30, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 +; SI-NEXT: v_or_b32_e32 v15, v1, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; SI-NEXT: v_alignbit_b32 v44, v19, v8, 16 -; SI-NEXT: v_alignbit_b32 v7, v1, v22, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 -; SI-NEXT: v_alignbit_b32 v8, v44, v36, 24 -; SI-NEXT: v_alignbit_b32 v60, v1, v27, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v29 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v44, v36, 16 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v39 -; SI-NEXT: v_alignbit_b32 v57, v1, v30, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v35 -; SI-NEXT: buffer_store_dword v8, 
off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v44, v36, 8 -; SI-NEXT: v_alignbit_b32 v58, v22, v9, 16 -; SI-NEXT: v_alignbit_b32 v40, v1, v37, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v58, v6, 24 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v58, v6, 16 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v49 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v58, v6, 8 -; SI-NEXT: v_alignbit_b32 v47, v25, v12, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v47, v2, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v18 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_mov_b32_e32 v19, v63 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v47, v2, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 +; SI-NEXT: v_mov_b32_e32 v23, v26 +; SI-NEXT: 
v_or_b32_e32 v26, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v47, v2, 8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v45, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v7, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v22, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; SI-NEXT: v_or_b32_e32 v56, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_mov_b32_e32 v47, v18 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v41 +; SI-NEXT: v_or_b32_e32 v20, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 +; SI-NEXT: v_or_b32_e32 v13, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63 +; SI-NEXT: v_or_b32_e32 v18, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v61 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 +; SI-NEXT: v_or_b32_e32 v9, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_mov_b32_e32 v60, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v46 +; SI-NEXT: v_or_b32_e32 v16, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v57 +; SI-NEXT: v_or_b32_e32 v57, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 +; SI-NEXT: v_or_b32_e32 v14, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v58 +; 
SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v31 +; SI-NEXT: v_or_b32_e32 v62, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 +; SI-NEXT: v_or_b32_e32 v12, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v29 +; SI-NEXT: v_or_b32_e32 v44, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v34 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v33 +; SI-NEXT: v_or_b32_e32 v10, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v35 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v36 +; SI-NEXT: v_or_b32_e32 v59, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v39 +; SI-NEXT: v_or_b32_e32 v8, v1, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v38 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v46 -; SI-NEXT: v_alignbit_b32 v53, v1, v48, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v49 +; SI-NEXT: v_mov_b32_e32 v49, v5 +; SI-NEXT: v_or_b32_e32 v5, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v48 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v28 +; SI-NEXT: v_or_b32_e32 v6, v1, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v50 -; SI-NEXT: v_alignbit_b32 v50, v8, v59, 16 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_or_b32_e32 v3, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v53, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v27, v21, v15, 24 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_mov_b32_e32 v58, v33 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v42 +; 
SI-NEXT: v_mov_b32_e32 v48, v23 +; SI-NEXT: v_alignbit_b32 v23, v6, v5, 16 +; SI-NEXT: v_mov_b32_e32 v38, v19 +; SI-NEXT: v_alignbit_b32 v24, v32, v43, 16 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v24, v32, v43, 8 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v24, v30, v17, 24 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v24, v30, v17, 16 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v24, v30, v17, 8 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v24, v21, v15, 16 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v24, v21, v15, 8 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v24, v39 +; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v14 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v12 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v10 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v8 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v6 +; SI-NEXT: buffer_store_dword v39, 
off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v50, v42 +; SI-NEXT: v_alignbit_b32 v37, v8, v59, 24 +; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v16 +; SI-NEXT: v_alignbit_b32 v36, v20, v56, 16 +; SI-NEXT: v_alignbit_b32 v61, v6, v5, 8 +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v45 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v4, v1, v2 +; SI-NEXT: v_alignbit_b32 v27, v26, v11, 24 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v50, v5, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v27, v26, v11, 16 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v50, v5, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v27, v26, v11, 8 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_or_b32_e32 v2, v2, v33 +; SI-NEXT: v_alignbit_b32 v33, v22, v7, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v50, v5, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v52, v1, v52, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 -; SI-NEXT: v_mov_b32_e32 v17, v63 -; SI-NEXT: v_alignbit_b32 v1, v1, v41, 16 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v31 -; SI-NEXT: 
v_alignbit_b32 v62, v8, v61, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v22, v7, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v62, v4, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v22, v7, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v62, v4, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v20, v56, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56 -; SI-NEXT: v_alignbit_b32 v55, v8, v63, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v20, v56, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v55, v3, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v18, v13, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v55, v3, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v18, v13, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v55, v3, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v48, v62, v4, 8 -; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v31 -; SI-NEXT: 
s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37 -; SI-NEXT: v_alignbit_b32 v38, v8, v45, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v38, v16, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_alignbit_b32 v35, v8, v18, 16 -; SI-NEXT: v_mov_b32_e32 v45, v8 -; SI-NEXT: v_alignbit_b32 v8, v35, v13, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v35, v13, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v29, v35, v13, 8 -; SI-NEXT: v_alignbit_b32 v61, v38, v16, 24 -; SI-NEXT: v_alignbit_b32 v41, v38, v16, 16 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v59 -; SI-NEXT: v_alignbit_b32 v30, v8, v21, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v18, v13, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v30, v10, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v16, v9, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v30, v10, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v16, v9, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v30, v10, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 
4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v16, v9, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v23 -; SI-NEXT: v_alignbit_b32 v27, v8, v24, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v14, v57, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v27, v7, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v14, v57, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v27, v7, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v14, v57, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v27, v7, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v12, v62, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v20 -; SI-NEXT: v_alignbit_b32 v24, v8, v26, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v12, v62, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v24, v60, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v12, v62, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt 
expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v24, v60, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v10, v44, 24 +; SI-NEXT: v_alignbit_b32 v29, v2, v1, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v32 -; SI-NEXT: v_alignbit_b32 v21, v8, v14, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v10, v44, 16 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v21, v57, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v22 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v21, v57, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v10, v44, 8 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v21, v57, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v20 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v34 -; SI-NEXT: v_alignbit_b32 v18, v8, v15, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v8, v59, 16 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v18, v40, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded 
Spill +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v18 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v18, v40, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v8, v18, v40, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v8, v59, 8 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v28 -; SI-NEXT: v_alignbit_b32 v63, v8, v51, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v29, 24, v52 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v52 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v63, v53, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 24 +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v63, v53, 16 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v33 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v54 +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v63, v53, 8 -; SI-NEXT: v_alignbit_b32 v12, v40, v43, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v52, v33 +; 
SI-NEXT: v_mov_b32_e32 v33, v23 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v12, v52, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v29, 24, v54 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v12, v52, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v29, 24, v47 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v12, v52, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v29, 24, v38 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v42 -; SI-NEXT: v_mov_b32_e32 v15, v9 -; SI-NEXT: v_alignbit_b32 v9, v8, v54, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v60 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v9, v1, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v60 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v9, v1, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v49 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_alignbit_b32 v27, v45, v11, 24 +; SI-NEXT: buffer_store_dword v27, off, 
s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v31, v45, v11, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v9, v1, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v27, v45, v11, 8 +; SI-NEXT: v_mov_b32_e32 v23, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v37 -; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v49 -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v41 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v46 -; SI-NEXT: v_lshrrev_b32_e32 v46, 24, v56 -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v32 -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v8 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v41 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v34 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v63 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v38 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v63 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v28 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: 
v_lshrrev_b32_e32 v38, 24, v46 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v15 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v46 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v33 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v48 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v59 -; SI-NEXT: v_lshrrev_b32_e32 v20, 24, v20 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v48 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v12 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v25 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v30 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v27, v4, v3, 24 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v24 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v42 -; SI-NEXT: v_lshrrev_b32_e32 v14, 24, 
v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v39 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v32 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v23 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v58 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v20, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v21 -; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v18 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v30 +; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v4 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v9 -; SI-NEXT: v_alignbit_b32 v26, v24, v60, 16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 8, v44 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v58 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v11, 8, v58 -; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v47 -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v50 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v21 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v62 -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; SI-NEXT: 
buffer_store_dword v38, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v55 -; SI-NEXT: v_lshrrev_b32_e32 v15, 8, v35 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_alignbit_b32 v35, v4, v3, 8 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v27 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, v28 -; SI-NEXT: v_mov_b32_e32 v23, v48 +; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v26 +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v47 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v63 -; SI-NEXT: v_mov_b32_e32 v48, v33 -; SI-NEXT: v_mov_b32_e32 v34, v53 -; SI-NEXT: v_mov_b32_e32 v53, v42 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v39, v24 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v49, 24, v50 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v50 +; SI-NEXT: v_alignbit_b32 v42, v4, v3, 16 +; SI-NEXT: v_alignbit_b32 v51, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v40, v2, v1, 8 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:400 ; 
4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v28 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v54, 24, v53 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v53 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v38, v49 +; SI-NEXT: v_mov_b32_e32 v49, v37 +; SI-NEXT: v_mov_b32_e32 v58, v35 +; SI-NEXT: v_alignbit_b32 v35, v32, v43, 24 ; SI-NEXT: s_branch .LBB91_3 ; SI-NEXT: .LBB91_2: -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr59 +; 
SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; kill: killed $vgpr37 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -163155,1143 +164524,1123 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: v_mov_b32_e32 v53, v42 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v48, v33 -; SI-NEXT: v_mov_b32_e32 v29, v28 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: v_mov_b32_e32 v17, v63 -; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr44 -; 
SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; kill: killed $vgpr56 ; SI-NEXT: .LBB91_3: ; %Flow -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_mov_b32_e32 v56, v17 -; SI-NEXT: v_mov_b32_e32 v54, v61 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v42, v32 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v34, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB91_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:644 ; 
4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v24 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v25 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, 
v10 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v36 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v49 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; 
SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v34, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v51, 0x40c00000, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v42, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v42 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_and_b32_e32 v24, 
0xffff0000, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v54, 0x40c00000, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v52 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v42 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v39 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: 
v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_alignbit_b32 v52, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v53, 0x40c00000, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v53 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v53 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v34, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v58, v4, v3, 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v51, v5, v4, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v26, 
0x40c00000, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_alignbit_b32 v57, v7, v5, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v26 -; SI-NEXT: v_alignbit_b32 v9, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v32 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v48 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 -; SI-NEXT: v_alignbit_b32 v12, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v28 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v60, v10, v6, 16 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v61, v6, v5, 8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_alignbit_b32 v7, v13, v7, 16 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v31 -; SI-NEXT: v_alignbit_b32 v63, v13, v10, 16 -; SI-NEXT: 
buffer_load_dword v10, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v37 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v59, v7, v8 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v44, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v10, v14, v10, 16 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v32 -; SI-NEXT: v_alignbit_b32 v18, v14, v13, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 
0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v21, v15, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v37 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v62, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v50, 0x40c00000, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v50 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v40, 0x40c00000, v31 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v40 ; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v38 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, 
v11 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v55, 0x40c00000, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v55 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_alignbit_b32 v13, v16, v13, 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v57, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v48 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v9, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v23 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v16, v19, v16, 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: 
v_or_b32_e32 v16, v16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v24, v15, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v20 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v13, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v24 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v56, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v42 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v42, v4, v3, 16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 
0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v7, v21, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v27, v15, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v3, v22, v19, 16 -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v54, 0x40c00000, v22 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v54 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v59, 0x40c00000, v44 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v59 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v30, v15, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, 
v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_or_b32_e32 v45, v24, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v39 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v45 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v40 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v52 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v55 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_alignbit_b32 v4, v25, v22, 16 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v40, 0x40c00000, v25 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; 
SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v40 -; SI-NEXT: v_alignbit_b32 v35, v45, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_alignbit_b32 v5, v28, v25, 16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v28 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v41 -; SI-NEXT: v_alignbit_b32 v38, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v56 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; SI-NEXT: v_alignbit_b32 v2, v33, v28, 16 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v33 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:548 ; 4-byte Folded 
Reload -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v43 -; SI-NEXT: v_alignbit_b32 v55, v61, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; SI-NEXT: v_alignbit_b32 v6, v36, v33, 16 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v36 -; SI-NEXT: v_add_f32_e32 v46, 0x40c00000, v36 -; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v54 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v46 -; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; SI-NEXT: v_alignbit_b32 v62, v15, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v39 -; SI-NEXT: v_alignbit_b32 v36, v39, v36, 16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v53 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:356 ; 
4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v42, 0x40c00000, v39 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v42 -; SI-NEXT: v_alignbit_b32 v50, v17, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v56, 0x40c00000, v39 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v56 -; SI-NEXT: v_alignbit_b32 v47, v25, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v15, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v39 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v39 -; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v50 -; SI-NEXT: v_alignbit_b32 
v58, v22, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v21, v28, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v56 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v25, v21, v15, 24 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v42 -; SI-NEXT: v_lshrrev_b32_e32 v42, 8, v63 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v25, v26, v11, 24 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v46 -; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v55 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v25, v26, v11, 16 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v43 -; SI-NEXT: v_alignbit_b32 v43, v38, v16, 8 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v25, v26, v11, 8 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v41 -; SI-NEXT: v_alignbit_b32 v41, v38, v16, 16 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v25, v45, v23, 24 +; 
SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v40 -; SI-NEXT: v_mov_b32_e32 v40, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v54 -; SI-NEXT: v_alignbit_b32 v54, v38, v16, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v25, v45, v23, 8 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v20 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v20, v35, v13, 8 +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v18 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 8, v35 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v16 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v23 -; SI-NEXT: v_alignbit_b32 v23, v62, v4, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v14 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_or_b32_e32 v17, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_or_b32_e32 v30, v30, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 
offset:580 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; SI-NEXT: v_or_b32_e32 v43, v31, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_or_b32_e32 v32, v32, v41 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v39 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v49 -; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v47 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v40 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v18 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v52 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v62 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v55 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 8, v58 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v54 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt 
expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v26 -; SI-NEXT: v_alignbit_b32 v26, v24, v60, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v44, v19, v14, 16 -; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v59 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v51 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v44, v36, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v50 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v44, v36, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v49 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v44, v36, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v48 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v58, v6, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v38 +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v54, 24, v34 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v33 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v58, v6, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:392 ; 4-byte 
Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v33 +; SI-NEXT: v_alignbit_b32 v33, v22, v7, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v58, v6, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v22, v7, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v47, v2, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v22, v7, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v47, v2, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v20, v56, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v47, v2, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v20, v56, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v50, v5, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v18, v13, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v50, v5, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v18, v13, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v50, v5, 8 -; 
SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v18, v13, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v62, v4, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v16, v9, 24 +; SI-NEXT: v_alignbit_b32 v24, v32, v43, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v62, v4, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v16, v9, 16 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v55, v3, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v24, v30, v17, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v55, v3, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v16, v9, 8 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v55, v3, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v24, v30, v17, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v35, v13, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v14, v57, 24 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v35, v13, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v24, v30, v17, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v30, v10, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v14, v57, 16 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v30, v10, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v24, v21, v15, 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v30, v10, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v14, v57, 8 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v27, v7, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v24, v21, v15, 8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v27, v7, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v12, v62, 24 +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v27, v7, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v37 +; SI-NEXT: 
buffer_store_dword v24, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v24, v60, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v12, v62, 16 +; SI-NEXT: v_alignbit_b32 v24, v4, v3, 24 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v24, v60, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v36 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v21, v57, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v12, v62, 8 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v21, v57, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v24, v2, v1, 24 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v21, v57, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v10, v44, 24 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v18, v51, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v24, 8, v32 +; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v12 +; SI-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v18, v51, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v8, v18, v51, 8 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v10, v44, 16 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v12 -; SI-NEXT: v_lshrrev_b32_e32 v51, 8, v44 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v24, 8, v30 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v21 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v10 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v63, v34, 24 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v10, v44, 8 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v9 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v24, 8, v21 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v63, v34, 16 -; SI-NEXT: buffer_store_dword v32, 
off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v8 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v63, v34, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v8, v59, 16 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v12, v52, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v24, 8, v26 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v12, v52, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v51 +; SI-NEXT: v_lshrrev_b32_e32 v49, 24, v49 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v12, v52, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v8, v59, 8 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v9, v1, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v24, 8, v22 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v9, v1, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v4 +; SI-NEXT: 
buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v50, 24, v50 +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v48, 24, v48 +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v35, v32, v43, 24 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_alignbit_b32 v41, v32, v43, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v9, v1, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v34, v45, v23, 16 +; SI-NEXT: v_alignbit_b32 v36, v20, v56, 16 +; SI-NEXT: v_alignbit_b32 v49, v8, v59, 24 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v52, v6, v5, 24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v38 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v51, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v40, v2, v1, 8 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v30 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v24, 8, v20 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v27 +; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v2 +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v37, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: .LBB91_5: ; %end -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v36, 0xff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 -; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v31, 0xff, v43 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v41 +; SI-NEXT: v_or_b32_e32 v31, v31, v46 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v35 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v17 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v15 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v11 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v28 -; SI-NEXT: v_or_b32_e32 v32, v36, v32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v36, 0xff, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; SI-NEXT: v_and_b32_e32 v32, 0xffff, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v29 -; SI-NEXT: v_or_b32_e32 v36, v56, v36 -; SI-NEXT: v_or_b32_e32 v32, v32, v36 -; SI-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v48, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; SI-NEXT: v_or_b32_e32 v37, v37, v48 +; SI-NEXT: v_or_b32_e32 v31, v31, v37 +; SI-NEXT: buffer_store_dword v31, v0, s[0:3], 0 
offen +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v32, 0xff, v44 -; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v51 -; SI-NEXT: v_or_b32_e32 v32, v32, v36 -; SI-NEXT: v_and_b32_e32 v36, 0xff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; SI-NEXT: v_or_b32_e32 v14, v14, v36 -; SI-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v14, v32, v14 +; SI-NEXT: v_and_b32_e32 v31, 0xff, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v24 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v32, 0xff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v24 +; SI-NEXT: v_or_b32_e32 v32, v35, v32 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: v_add_i32_e32 v32, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v14, v32, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v56 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v31, 8, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v29, v29, v31 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xff, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_lshlrev_b32_e32 v32, 24, v17 +; SI-NEXT: v_or_b32_e32 v31, v32, v31 +; SI-NEXT: v_or_b32_e32 v29, v29, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v6 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v14, v14, v32 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v17 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v32, 0xff, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v17 +; SI-NEXT: v_or_b32_e32 v30, v31, v30 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_add_i32_e32 v30, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xff, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v57 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v27, v27, v29 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v19 -; SI-NEXT: v_or_b32_e32 v32, 
v33, v32 -; SI-NEXT: v_or_b32_e32 v14, v14, v32 -; SI-NEXT: v_add_i32_e32 v32, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v14, v32, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v58 -; SI-NEXT: v_or_b32_e32 v11, v14, v11 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v6 -; SI-NEXT: v_or_b32_e32 v14, v32, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v15 +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: v_or_b32_e32 v27, v27, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v2 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v60 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xff, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 8, v15 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 
-; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v15 +; SI-NEXT: v_or_b32_e32 v28, v29, v28 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xff, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v44 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v25, v25, v27 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xff, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v11 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_or_b32_e32 v25, v25, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v11 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v19 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; SI-NEXT: s_waitcnt 
vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v28, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v11 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v23, v23, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v11 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_or_b32_e32 v23, v23, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v47 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v49 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v45 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 
v28, 24, v2 -; SI-NEXT: v_or_b32_e32 v14, v28, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v11 +; SI-NEXT: v_or_b32_e32 v24, v25, v24 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v57 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; SI-NEXT: v_or_b32_e32 v21, v21, v23 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v25, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_or_b32_e32 v21, v21, v23 +; SI-NEXT: v_add_i32_e32 
v23, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v39 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v2 -; SI-NEXT: v_or_b32_e32 v14, v25, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v11 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v23 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; SI-NEXT: v_or_b32_e32 
v19, v19, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v17 -; SI-NEXT: v_or_b32_e32 v14, v22, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v22 +; SI-NEXT: v_or_b32_e32 v21, v22, v21 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v62 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v31 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v11 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v21 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v17 -; SI-NEXT: v_or_b32_e32 v14, v22, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 
v20, 16, v20 +; SI-NEXT: v_or_b32_e32 v20, v21, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v34 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; SI-NEXT: v_or_b32_e32 v17, v17, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v19, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_or_b32_e32 v17, v17, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v55 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v46 -; SI-NEXT: v_or_b32_e32 v11, v11, 
v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v61 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v18 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v7 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v59 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v17 -; SI-NEXT: v_or_b32_e32 v14, v19, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v43 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v41 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v54 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; 
SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v38 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xff, v16 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v2 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v11 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xff, v14 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v11 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v62 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v15 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v20 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: buffer_load_dword v13, 
off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 56, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v15 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v45 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xff, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: 
v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v30 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v2 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; 
SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; SI-NEXT: v_or_b32_e32 v7, v7, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_or_b32_e32 v7, v7, v10 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v49 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: 
v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v27 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v61 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; 
SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v52 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v24 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v47 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v5, 
0xffff, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v21 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v58 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; 
SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v52 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v18 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v54 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: 
v_lshlrev_b32_e32 v5, 24, v5 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v55 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v63 -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v42 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x6c, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v2, 
v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v12 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v51 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 -; SI-NEXT: buffer_load_dword 
v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v38 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 @@ -164322,8 +165671,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: v_writelane_b32 v63, s30, 0 ; VI-NEXT: v_writelane_b32 v63, s31, 1 @@ -164358,24 +165707,24 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_writelane_b32 v63, s86, 30 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 ; VI-NEXT: v_writelane_b32 v63, s87, 31 -; VI-NEXT: v_readfirstlane_b32 s44, v3 -; VI-NEXT: v_readfirstlane_b32 s45, v4 -; VI-NEXT: v_readfirstlane_b32 s42, v5 -; VI-NEXT: v_readfirstlane_b32 s43, v6 -; VI-NEXT: v_readfirstlane_b32 s40, v7 -; VI-NEXT: v_readfirstlane_b32 s41, v8 -; VI-NEXT: v_readfirstlane_b32 s14, v9 -; VI-NEXT: v_readfirstlane_b32 s15, v10 -; VI-NEXT: v_readfirstlane_b32 
s12, v11 -; VI-NEXT: v_readfirstlane_b32 s13, v12 -; VI-NEXT: v_readfirstlane_b32 s10, v13 -; VI-NEXT: v_readfirstlane_b32 s11, v14 -; VI-NEXT: v_readfirstlane_b32 s8, v15 -; VI-NEXT: v_readfirstlane_b32 s9, v16 -; VI-NEXT: v_readfirstlane_b32 s6, v17 -; VI-NEXT: v_readfirstlane_b32 s7, v18 +; VI-NEXT: v_readfirstlane_b32 s76, v3 +; VI-NEXT: v_readfirstlane_b32 s77, v4 +; VI-NEXT: v_readfirstlane_b32 s74, v5 +; VI-NEXT: v_readfirstlane_b32 s75, v6 +; VI-NEXT: v_readfirstlane_b32 s72, v7 +; VI-NEXT: v_readfirstlane_b32 s73, v8 +; VI-NEXT: v_readfirstlane_b32 s62, v9 +; VI-NEXT: v_readfirstlane_b32 s63, v10 +; VI-NEXT: v_readfirstlane_b32 s60, v11 +; VI-NEXT: v_readfirstlane_b32 s61, v12 +; VI-NEXT: v_readfirstlane_b32 s58, v13 +; VI-NEXT: v_readfirstlane_b32 s59, v14 +; VI-NEXT: v_readfirstlane_b32 s56, v15 +; VI-NEXT: v_readfirstlane_b32 s57, v16 +; VI-NEXT: v_readfirstlane_b32 s44, v17 +; VI-NEXT: v_readfirstlane_b32 s45, v18 ; VI-NEXT: v_readfirstlane_b32 s4, v1 -; VI-NEXT: s_and_b64 s[46:47], vcc, exec +; VI-NEXT: s_and_b64 s[6:7], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s5, v2 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill @@ -164394,954 +165743,1003 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane ; VI-NEXT: s_cbranch_scc0 .LBB91_3 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s46, s5, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 57 -; VI-NEXT: s_lshr_b32 s46, s5, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 56 -; VI-NEXT: s_lshr_b32 s46, s5, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 55 -; VI-NEXT: s_lshr_b32 s46, s4, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 54 -; VI-NEXT: s_lshr_b32 s46, s4, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 53 -; VI-NEXT: s_lshr_b32 s46, s29, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 52 -; VI-NEXT: s_lshr_b32 s46, s29, 16 
-; VI-NEXT: v_writelane_b32 v62, s46, 51 -; VI-NEXT: s_lshr_b32 s46, s29, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 50 -; VI-NEXT: s_lshr_b32 s46, s28, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 49 -; VI-NEXT: s_lshr_b32 s46, s28, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 48 -; VI-NEXT: s_lshr_b32 s46, s27, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 47 -; VI-NEXT: s_lshr_b32 s46, s27, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 46 -; VI-NEXT: s_lshr_b32 s46, s27, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 45 -; VI-NEXT: s_lshr_b32 s46, s26, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 44 -; VI-NEXT: s_lshr_b32 s46, s26, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 43 -; VI-NEXT: s_lshr_b32 s46, s25, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 42 -; VI-NEXT: s_lshr_b32 s46, s25, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 41 -; VI-NEXT: s_lshr_b32 s46, s25, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 40 -; VI-NEXT: s_lshr_b32 s46, s24, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 39 -; VI-NEXT: s_lshr_b32 s46, s24, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 38 -; VI-NEXT: s_lshr_b32 s46, s23, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 37 -; VI-NEXT: s_lshr_b32 s46, s23, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 36 -; VI-NEXT: s_lshr_b32 s46, s23, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 35 -; VI-NEXT: s_lshr_b32 s46, s22, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 34 -; VI-NEXT: s_lshr_b32 s46, s22, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 33 -; VI-NEXT: s_lshr_b32 s46, s21, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 32 -; VI-NEXT: s_lshr_b32 s46, s21, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 31 -; VI-NEXT: s_lshr_b32 s46, s21, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 30 -; VI-NEXT: s_lshr_b32 s46, s20, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 29 -; VI-NEXT: s_lshr_b32 s46, s20, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 28 -; VI-NEXT: s_lshr_b32 s46, s19, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 27 -; VI-NEXT: s_lshr_b32 s46, s19, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 26 -; VI-NEXT: s_lshr_b32 s46, s19, 8 -; 
VI-NEXT: v_writelane_b32 v62, s46, 25 -; VI-NEXT: s_lshr_b32 s46, s18, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 24 -; VI-NEXT: s_lshr_b32 s46, s18, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 23 -; VI-NEXT: s_lshr_b32 s46, s17, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 22 -; VI-NEXT: s_lshr_b32 s46, s17, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 21 -; VI-NEXT: s_lshr_b32 s46, s17, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 20 -; VI-NEXT: s_lshr_b32 s46, s16, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 19 -; VI-NEXT: s_lshr_b32 s46, s16, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 18 -; VI-NEXT: s_lshr_b32 s46, s7, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 17 -; VI-NEXT: s_lshr_b32 s46, s7, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 16 -; VI-NEXT: s_lshr_b32 s46, s7, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 15 -; VI-NEXT: s_lshr_b32 s46, s6, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 14 -; VI-NEXT: s_lshr_b32 s46, s6, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 13 -; VI-NEXT: s_lshr_b32 s46, s9, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 12 -; VI-NEXT: s_lshr_b32 s46, s9, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 11 -; VI-NEXT: s_lshr_b32 s46, s9, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 10 -; VI-NEXT: s_lshr_b32 s46, s8, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 9 -; VI-NEXT: s_lshr_b32 s46, s8, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 8 -; VI-NEXT: s_lshr_b32 s46, s11, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 7 -; VI-NEXT: s_lshr_b32 s46, s11, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 6 -; VI-NEXT: s_lshr_b32 s46, s11, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 5 -; VI-NEXT: s_lshr_b32 s46, s10, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 4 -; VI-NEXT: s_lshr_b32 s46, s10, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 3 -; VI-NEXT: s_lshr_b32 s46, s13, 24 -; VI-NEXT: v_writelane_b32 v62, s46, 2 -; VI-NEXT: s_lshr_b32 s46, s13, 16 -; VI-NEXT: v_writelane_b32 v62, s46, 1 -; VI-NEXT: s_lshr_b32 s46, s12, 16 -; VI-NEXT: s_lshr_b32 s80, s13, 8 -; VI-NEXT: v_writelane_b32 v62, s46, 0 -; VI-NEXT: s_lshr_b32 s81, 
s12, 8 -; VI-NEXT: s_lshr_b32 s82, s15, 24 -; VI-NEXT: s_lshr_b32 s83, s15, 16 -; VI-NEXT: s_lshr_b32 s85, s15, 8 -; VI-NEXT: s_lshr_b32 s84, s14, 16 -; VI-NEXT: s_lshr_b32 s86, s14, 8 -; VI-NEXT: s_lshr_b32 s87, s41, 24 -; VI-NEXT: s_lshr_b32 s50, s41, 16 -; VI-NEXT: s_lshr_b32 s52, s41, 8 -; VI-NEXT: s_lshr_b32 s51, s40, 16 -; VI-NEXT: s_lshr_b32 s53, s40, 8 -; VI-NEXT: s_lshr_b32 s54, s43, 24 -; VI-NEXT: s_lshr_b32 s55, s43, 16 -; VI-NEXT: s_lshr_b32 s65, s43, 8 -; VI-NEXT: s_lshr_b32 s64, s42, 16 -; VI-NEXT: s_lshr_b32 s66, s42, 8 -; VI-NEXT: s_lshr_b32 s67, s45, 24 -; VI-NEXT: s_lshr_b32 s68, s45, 16 -; VI-NEXT: s_lshr_b32 s70, s45, 8 -; VI-NEXT: s_lshr_b32 s69, s44, 16 -; VI-NEXT: s_lshr_b32 s71, s44, 8 -; VI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[56:57], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[58:59], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[60:61], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[72:73], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[74:75], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[76:77], s[16:17], 24 -; VI-NEXT: s_lshr_b64 s[78:79], s[6:7], 24 -; VI-NEXT: s_lshr_b64 s[88:89], s[8:9], 24 -; VI-NEXT: s_lshr_b64 s[90:91], s[10:11], 24 -; VI-NEXT: s_lshr_b64 s[30:31], s[12:13], 24 -; VI-NEXT: s_lshr_b64 s[34:35], s[14:15], 24 -; VI-NEXT: s_lshr_b64 s[36:37], s[40:41], 24 -; VI-NEXT: s_lshr_b64 s[38:39], s[42:43], 24 -; VI-NEXT: s_lshr_b64 s[48:49], s[44:45], 24 +; VI-NEXT: s_lshr_b32 s6, s5, 24 +; VI-NEXT: v_writelane_b32 v62, s6, 19 +; VI-NEXT: s_lshr_b32 s6, s5, 16 +; VI-NEXT: v_writelane_b32 v62, s6, 21 +; VI-NEXT: s_lshr_b32 s6, s5, 8 +; VI-NEXT: v_writelane_b32 v62, s6, 18 +; VI-NEXT: s_lshr_b32 s6, s4, 16 +; VI-NEXT: v_writelane_b32 v62, s6, 23 +; VI-NEXT: s_lshr_b32 s6, s4, 8 +; VI-NEXT: v_writelane_b32 v62, s6, 20 +; VI-NEXT: s_lshr_b32 s6, s29, 24 +; VI-NEXT: v_writelane_b32 v62, s6, 25 +; VI-NEXT: s_lshr_b32 s6, s29, 16 +; VI-NEXT: v_writelane_b32 v62, s6, 26 +; VI-NEXT: s_lshr_b32 s6, s29, 
8 +; VI-NEXT: v_writelane_b32 v62, s6, 22 +; VI-NEXT: s_lshr_b32 s6, s28, 16 +; VI-NEXT: v_writelane_b32 v62, s6, 28 +; VI-NEXT: s_lshr_b32 s6, s28, 8 +; VI-NEXT: v_writelane_b32 v62, s6, 24 +; VI-NEXT: s_lshr_b32 s6, s27, 24 +; VI-NEXT: v_writelane_b32 v62, s6, 30 +; VI-NEXT: s_lshr_b32 s6, s27, 16 +; VI-NEXT: v_writelane_b32 v62, s6, 31 +; VI-NEXT: s_lshr_b32 s6, s27, 8 +; VI-NEXT: v_writelane_b32 v62, s6, 27 +; VI-NEXT: s_lshr_b32 s6, s26, 16 +; VI-NEXT: v_writelane_b32 v62, s6, 34 +; VI-NEXT: s_lshr_b32 s6, s26, 8 +; VI-NEXT: v_writelane_b32 v62, s6, 29 +; VI-NEXT: s_lshr_b32 s6, s25, 24 +; VI-NEXT: v_writelane_b32 v62, s6, 36 +; VI-NEXT: s_lshr_b32 s6, s25, 16 +; VI-NEXT: v_writelane_b32 v62, s6, 37 +; VI-NEXT: s_lshr_b32 s6, s25, 8 +; VI-NEXT: v_writelane_b32 v62, s6, 32 +; VI-NEXT: s_lshr_b32 s6, s24, 16 +; VI-NEXT: v_writelane_b32 v62, s6, 39 +; VI-NEXT: s_lshr_b32 s6, s24, 8 +; VI-NEXT: v_writelane_b32 v62, s6, 33 +; VI-NEXT: s_lshr_b32 s6, s23, 24 +; VI-NEXT: v_writelane_b32 v62, s6, 41 +; VI-NEXT: s_lshr_b32 s6, s23, 16 +; VI-NEXT: v_writelane_b32 v62, s6, 42 +; VI-NEXT: s_lshr_b32 s6, s23, 8 +; VI-NEXT: v_writelane_b32 v62, s6, 35 +; VI-NEXT: s_lshr_b32 s6, s22, 16 +; VI-NEXT: v_writelane_b32 v62, s6, 44 +; VI-NEXT: s_lshr_b32 s6, s22, 8 +; VI-NEXT: v_writelane_b32 v62, s6, 38 +; VI-NEXT: s_lshr_b32 s6, s21, 24 +; VI-NEXT: v_writelane_b32 v62, s6, 46 +; VI-NEXT: s_lshr_b32 s6, s21, 16 +; VI-NEXT: v_writelane_b32 v62, s6, 48 +; VI-NEXT: s_lshr_b32 s6, s21, 8 +; VI-NEXT: v_writelane_b32 v62, s6, 40 +; VI-NEXT: s_lshr_b32 s6, s20, 16 +; VI-NEXT: v_writelane_b32 v62, s6, 50 +; VI-NEXT: s_lshr_b32 s6, s20, 8 +; VI-NEXT: v_writelane_b32 v62, s6, 43 +; VI-NEXT: s_lshr_b32 s6, s19, 24 +; VI-NEXT: v_writelane_b32 v62, s6, 52 +; VI-NEXT: s_lshr_b32 s6, s19, 16 +; VI-NEXT: v_writelane_b32 v62, s6, 53 +; VI-NEXT: s_lshr_b32 s6, s19, 8 +; VI-NEXT: v_writelane_b32 v62, s6, 45 +; VI-NEXT: s_lshr_b32 s6, s18, 16 +; VI-NEXT: v_writelane_b32 v62, s6, 54 +; VI-NEXT: 
s_lshr_b32 s6, s18, 8 +; VI-NEXT: v_writelane_b32 v62, s6, 47 +; VI-NEXT: s_lshr_b32 s6, s17, 24 +; VI-NEXT: v_writelane_b32 v62, s6, 55 +; VI-NEXT: s_lshr_b32 s6, s17, 16 +; VI-NEXT: v_writelane_b32 v62, s6, 56 +; VI-NEXT: s_lshr_b32 s6, s17, 8 +; VI-NEXT: v_writelane_b32 v62, s6, 49 +; VI-NEXT: s_lshr_b32 s6, s16, 16 +; VI-NEXT: v_writelane_b32 v62, s6, 57 +; VI-NEXT: s_lshr_b32 s6, s16, 8 +; VI-NEXT: v_writelane_b32 v62, s6, 51 +; VI-NEXT: s_lshr_b32 s6, s59, 8 +; VI-NEXT: v_writelane_b32 v62, s6, 16 +; VI-NEXT: s_lshr_b32 s6, s61, 8 +; VI-NEXT: v_writelane_b32 v62, s6, 17 +; VI-NEXT: s_lshr_b64 s[78:79], s[4:5], 24 +; VI-NEXT: v_writelane_b32 v62, s78, 14 +; VI-NEXT: v_writelane_b32 v62, s79, 15 +; VI-NEXT: s_lshr_b64 s[78:79], s[28:29], 24 +; VI-NEXT: v_writelane_b32 v62, s78, 12 +; VI-NEXT: v_writelane_b32 v62, s79, 13 +; VI-NEXT: s_lshr_b64 s[78:79], s[26:27], 24 +; VI-NEXT: v_writelane_b32 v62, s78, 10 +; VI-NEXT: v_writelane_b32 v62, s79, 11 +; VI-NEXT: s_lshr_b64 s[78:79], s[24:25], 24 +; VI-NEXT: v_writelane_b32 v62, s78, 8 +; VI-NEXT: v_writelane_b32 v62, s79, 9 +; VI-NEXT: s_lshr_b64 s[78:79], s[22:23], 24 +; VI-NEXT: v_writelane_b32 v62, s78, 6 +; VI-NEXT: v_writelane_b32 v62, s79, 7 +; VI-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; VI-NEXT: v_writelane_b32 v62, s78, 4 +; VI-NEXT: v_writelane_b32 v62, s79, 5 +; VI-NEXT: s_lshr_b64 s[78:79], s[18:19], 24 +; VI-NEXT: v_writelane_b32 v62, s78, 2 +; VI-NEXT: v_writelane_b32 v62, s79, 3 +; VI-NEXT: s_lshr_b64 s[78:79], s[16:17], 24 +; VI-NEXT: v_writelane_b32 v62, s78, 0 +; VI-NEXT: s_lshr_b32 s50, s45, 24 +; VI-NEXT: s_lshr_b32 s47, s45, 16 +; VI-NEXT: s_lshr_b32 s53, s45, 8 +; VI-NEXT: s_lshr_b32 s51, s44, 16 +; VI-NEXT: s_lshr_b32 s9, s44, 8 +; VI-NEXT: s_lshr_b32 s43, s57, 24 +; VI-NEXT: s_lshr_b32 s42, s57, 16 +; VI-NEXT: s_lshr_b32 s52, s57, 8 +; VI-NEXT: s_lshr_b32 s46, s56, 16 +; VI-NEXT: s_lshr_b32 s8, s56, 8 +; VI-NEXT: s_lshr_b32 s40, s59, 24 +; VI-NEXT: s_lshr_b32 s15, s59, 16 +; VI-NEXT: 
s_lshr_b32 s41, s58, 16 +; VI-NEXT: s_lshr_b32 s65, s58, 8 +; VI-NEXT: s_lshr_b32 s83, s61, 24 +; VI-NEXT: s_lshr_b32 s82, s61, 16 +; VI-NEXT: s_lshr_b32 s14, s60, 16 +; VI-NEXT: s_lshr_b32 s64, s60, 8 +; VI-NEXT: s_lshr_b32 s13, s63, 24 +; VI-NEXT: s_lshr_b32 s12, s63, 16 +; VI-NEXT: s_lshr_b32 s87, s63, 8 +; VI-NEXT: s_lshr_b32 s81, s62, 16 +; VI-NEXT: s_lshr_b32 s55, s62, 8 +; VI-NEXT: s_lshr_b32 s71, s73, 24 +; VI-NEXT: s_lshr_b32 s70, s73, 16 +; VI-NEXT: s_lshr_b32 s86, s73, 8 +; VI-NEXT: s_lshr_b32 s80, s72, 16 +; VI-NEXT: s_lshr_b32 s54, s72, 8 +; VI-NEXT: s_lshr_b32 s10, s75, 24 +; VI-NEXT: s_lshr_b32 s69, s75, 16 +; VI-NEXT: s_lshr_b32 s85, s75, 8 +; VI-NEXT: s_lshr_b32 s11, s74, 16 +; VI-NEXT: s_lshr_b32 s7, s74, 8 +; VI-NEXT: s_lshr_b32 s67, s77, 24 +; VI-NEXT: s_lshr_b32 s66, s77, 16 +; VI-NEXT: s_lshr_b32 s84, s77, 8 +; VI-NEXT: s_lshr_b32 s68, s76, 16 +; VI-NEXT: s_lshr_b32 s6, s76, 8 +; VI-NEXT: v_writelane_b32 v62, s79, 1 +; VI-NEXT: s_lshr_b64 s[78:79], s[44:45], 24 +; VI-NEXT: s_lshr_b64 s[88:89], s[56:57], 24 +; VI-NEXT: s_lshr_b64 s[90:91], s[58:59], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[60:61], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[62:63], 24 +; VI-NEXT: s_lshr_b64 s[36:37], s[72:73], 24 +; VI-NEXT: s_lshr_b64 s[38:39], s[74:75], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[76:77], 24 ; VI-NEXT: s_cbranch_execnz .LBB91_4 ; VI-NEXT: .LBB91_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s46, s45, 16 -; VI-NEXT: v_mov_b32_e32 v31, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s46, v31 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s45, s45, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s45, v31 +; VI-NEXT: s_and_b32 s6, s77, 0xffff0000 +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; 
VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_lshl_b32 s6, s77, 16 +; VI-NEXT: v_cndmask_b32_e32 v23, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s6, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_and_b32 s6, s76, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v9, v3, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_lshl_b32 s6, s76, 16 +; VI-NEXT: v_cndmask_b32_e32 v24, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s6, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v24 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_and_b32 s6, s75, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v8, v3, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_lshl_b32 s6, s75, 16 +; VI-NEXT: v_cndmask_b32_e32 v48, v3, v4, vcc +; VI-NEXT: 
v_add_f32_e32 v3, s6, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v48 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_and_b32 s6, s74, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v11, v3, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_lshl_b32 s6, s74, 16 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s45, s44, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s45, v31 -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_f32_e32 v3, s6, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_and_b32 s6, s73, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v10, v3, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s44, s44, 0xffff0000 -; VI-NEXT: 
v_cndmask_b32_e32 v1, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s44, v31 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_lshl_b32 s6, s73, 16 +; VI-NEXT: v_cndmask_b32_e32 v54, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s6, v1 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: s_lshl_b32 s44, s43, 16 -; VI-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; VI-NEXT: v_add_f32_e32 v3, s44, v31 +; VI-NEXT: s_and_b32 s6, s72, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v13, v3, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_lshl_b32 s6, s72, 16 +; VI-NEXT: v_cndmask_b32_e32 v44, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s6, v1 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s43, s43, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s43, v31 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: s_lshl_b32 s43, s42, 16 -; VI-NEXT: v_alignbit_b32 v4, v4, v3, 16 -; VI-NEXT: v_add_f32_e32 v3, s43, v31 -; VI-NEXT: 
v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; VI-NEXT: s_and_b32 s6, s63, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v12, v26, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_lshl_b32 s6, s63, 16 +; VI-NEXT: v_cndmask_b32_e32 v59, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s6, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s42, s42, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; VI-NEXT: v_add_f32_e32 v5, s42, v31 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: s_lshl_b32 s42, s41, 16 -; VI-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; VI-NEXT: v_add_f32_e32 v5, s42, v31 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: s_and_b32 s41, s41, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_add_f32_e32 v6, s41, v31 -; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v8, 0x400000, v6 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_cndmask_b32_e32 
v6, v7, v8, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: s_lshl_b32 s41, s40, 16 -; VI-NEXT: v_alignbit_b32 v6, v6, v5, 16 -; VI-NEXT: v_add_f32_e32 v5, s41, v31 -; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: s_and_b32 s40, s40, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc -; VI-NEXT: v_add_f32_e32 v7, s40, v31 -; VI-NEXT: v_bfe_u32 v8, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: s_lshl_b32 s40, s15, 16 -; VI-NEXT: v_alignbit_b32 v5, v7, v5, 16 -; VI-NEXT: v_add_f32_e32 v7, s40, v31 -; VI-NEXT: v_bfe_u32 v8, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: s_and_b32 s15, s15, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc -; VI-NEXT: v_add_f32_e32 v8, s15, v31 -; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: s_lshl_b32 s15, s14, 16 -; VI-NEXT: v_alignbit_b32 v8, v8, v7, 16 -; VI-NEXT: v_add_f32_e32 v7, s15, v31 -; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: s_and_b32 s14, s14, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc -; VI-NEXT: v_add_f32_e32 v9, s14, v31 
-; VI-NEXT: v_bfe_u32 v10, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: s_lshl_b32 s14, s13, 16 -; VI-NEXT: v_alignbit_b32 v7, v9, v7, 16 -; VI-NEXT: v_add_f32_e32 v9, s14, v31 -; VI-NEXT: v_bfe_u32 v10, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: s_and_b32 s13, s13, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc -; VI-NEXT: v_add_f32_e32 v10, s13, v31 -; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: v_or_b32_e32 v12, 0x400000, v10 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; VI-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: s_lshl_b32 s13, s12, 16 -; VI-NEXT: v_alignbit_b32 v10, v10, v9, 16 -; VI-NEXT: v_add_f32_e32 v9, s13, v31 -; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: v_or_b32_e32 v12, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: s_and_b32 s12, s12, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc -; VI-NEXT: v_add_f32_e32 v11, s12, v31 -; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: s_lshl_b32 s12, s11, 16 -; VI-NEXT: v_alignbit_b32 v9, v11, v9, 16 -; VI-NEXT: v_add_f32_e32 v11, s12, v31 -; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 
-; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: s_and_b32 s11, s11, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc -; VI-NEXT: v_add_f32_e32 v12, s11, v31 -; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: s_lshl_b32 s11, s10, 16 -; VI-NEXT: v_alignbit_b32 v12, v12, v11, 16 -; VI-NEXT: v_add_f32_e32 v11, s11, v31 -; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: s_and_b32 s10, s10, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v11, v13, v14, vcc -; VI-NEXT: v_add_f32_e32 v13, s10, v31 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: s_lshl_b32 s10, s9, 16 -; VI-NEXT: v_alignbit_b32 v11, v13, v11, 16 -; VI-NEXT: v_add_f32_e32 v13, s10, v31 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_and_b32 s9, s9, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc -; VI-NEXT: v_add_f32_e32 v14, s9, v31 -; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 -; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; 
VI-NEXT: v_or_b32_e32 v16, 0x400000, v14 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; VI-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: s_lshl_b32 s9, s8, 16 -; VI-NEXT: v_alignbit_b32 v14, v14, v13, 16 -; VI-NEXT: v_add_f32_e32 v13, s9, v31 -; VI-NEXT: v_bfe_u32 v15, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v13 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x7fff, v15 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: s_and_b32 s8, s8, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v13, v15, v16, vcc -; VI-NEXT: v_add_f32_e32 v15, s8, v31 -; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 +; VI-NEXT: s_and_b32 s6, s62, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v7, v25, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_lshl_b32 s6, s62, 16 +; VI-NEXT: v_cndmask_b32_e32 v61, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s6, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; VI-NEXT: s_and_b32 s6, s61, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v6, v28, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_lshl_b32 s6, s61, 16 +; 
VI-NEXT: v_cndmask_b32_e32 v37, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s6, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v37 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; VI-NEXT: s_and_b32 s6, s60, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v15, v27, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_lshl_b32 s6, s60, 16 +; VI-NEXT: v_cndmask_b32_e32 v36, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s6, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v36 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; VI-NEXT: s_and_b32 s6, s59, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v14, v30, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_lshl_b32 s6, s59, 16 +; VI-NEXT: v_cndmask_b32_e32 v20, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s6, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; VI-NEXT: 
s_and_b32 s6, s58, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v5, v29, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_lshl_b32 s6, s58, 16 +; VI-NEXT: v_cndmask_b32_e32 v21, v3, v4, vcc +; VI-NEXT: v_add_f32_e32 v3, s6, v1 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v16, vcc +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; VI-NEXT: s_and_b32 s6, s57, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v4, v32, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_lshl_b32 s6, s57, 16 +; VI-NEXT: v_cndmask_b32_e32 v22, v3, v16, vcc +; VI-NEXT: v_add_f32_e32 v3, s6, v1 +; VI-NEXT: v_bfe_u32 v16, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v3 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: s_lshl_b32 s8, s7, 16 -; VI-NEXT: v_alignbit_b32 v13, v15, v13, 16 -; VI-NEXT: v_add_f32_e32 v15, s8, v31 -; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v22 +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 +; VI-NEXT: s_and_b32 s6, s56, 0xffff0000 +; VI-NEXT: 
v_or_b32_e32 v17, v31, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_lshl_b32 s6, s56, 16 +; VI-NEXT: v_cndmask_b32_e32 v51, v3, v16, vcc +; VI-NEXT: v_add_f32_e32 v3, s6, v1 +; VI-NEXT: v_bfe_u32 v16, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v3 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: s_and_b32 s7, s7, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s7, v31 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: s_lshl_b32 s7, s6, 16 -; VI-NEXT: v_alignbit_b32 v16, v16, v15, 16 -; VI-NEXT: v_add_f32_e32 v15, s7, v31 -; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: s_and_b32 s6, s6, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc -; VI-NEXT: v_add_f32_e32 v17, s6, v31 -; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v16, v18, vcc +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; VI-NEXT: s_and_b32 s6, s45, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v16, v34, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, 
v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_lshl_b32 s6, s45, 16 +; VI-NEXT: v_cndmask_b32_e32 v52, v3, v18, vcc +; VI-NEXT: v_add_f32_e32 v3, s6, v1 +; VI-NEXT: v_bfe_u32 v18, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v3 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: s_lshl_b32 s6, s17, 16 -; VI-NEXT: v_alignbit_b32 v15, v17, v15, 16 -; VI-NEXT: v_add_f32_e32 v17, s6, v31 -; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v18, v19, vcc +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v52 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; VI-NEXT: s_and_b32 s6, s44, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v19, v33, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_lshl_b32 s6, s44, 16 +; VI-NEXT: v_cndmask_b32_e32 v40, v3, v18, vcc +; VI-NEXT: v_add_f32_e32 v3, s6, v1 +; VI-NEXT: v_bfe_u32 v18, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v3 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v18, v35, vcc +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v40 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 ; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; VI-NEXT: v_add_f32_e32 v18, s6, v31 -; VI-NEXT: 
v_bfe_u32 v19, v18, 16, 1 -; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v18 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 -; VI-NEXT: v_or_b32_e32 v20, 0x400000, v18 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; VI-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; VI-NEXT: s_lshl_b32 s6, s16, 16 -; VI-NEXT: v_alignbit_b32 v18, v18, v17, 16 -; VI-NEXT: v_add_f32_e32 v17, s6, v31 -; VI-NEXT: v_bfe_u32 v19, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v17 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x7fff, v19 -; VI-NEXT: v_or_b32_e32 v20, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_or_b32_e32 v18, v35, v2 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_add_i32 s8, s7, 0x7fff +; VI-NEXT: s_or_b32 s9, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s78, s9, s8 +; VI-NEXT: s_lshl_b32 s6, s17, 16 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_and_b32 s8, s78, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s7, 0x7fff +; VI-NEXT: s_or_b32 s10, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s6, s10, s9 +; VI-NEXT: s_lshr_b32 s17, s6, 16 ; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v17, v19, v20, vcc -; VI-NEXT: v_add_f32_e32 v19, s6, v31 -; VI-NEXT: v_bfe_u32 v20, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v20, vcc, v20, v19 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x7fff, v20 -; VI-NEXT: v_or_b32_e32 v21, 0x400000, v19 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: s_lshl_b32 s6, s19, 16 -; VI-NEXT: v_alignbit_b32 v17, v19, v17, 16 -; VI-NEXT: 
v_add_f32_e32 v19, s6, v31 -; VI-NEXT: v_bfe_u32 v20, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v20, vcc, v20, v19 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x7fff, v20 -; VI-NEXT: v_or_b32_e32 v21, 0x400000, v19 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_or_b32 s45, s17, s8 +; VI-NEXT: s_add_i32 s8, s7, 0x7fff +; VI-NEXT: s_or_b32 s9, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s79, s9, s8 +; VI-NEXT: s_lshl_b32 s6, s16, 16 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_and_b32 s8, s79, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s7, 0x7fff +; VI-NEXT: s_or_b32 s10, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s6, s10, s9 +; VI-NEXT: s_lshr_b32 s16, s6, 16 ; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc -; VI-NEXT: v_add_f32_e32 v20, s6, v31 -; VI-NEXT: v_bfe_u32 v21, v20, 16, 1 -; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v20 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 -; VI-NEXT: v_or_b32_e32 v22, 0x400000, v20 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; VI-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: s_lshl_b32 s6, s18, 16 -; VI-NEXT: v_alignbit_b32 v20, v20, v19, 16 -; VI-NEXT: v_add_f32_e32 v19, s6, v31 -; VI-NEXT: v_bfe_u32 v21, v19, 16, 1 -; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v19 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x7fff, v21 -; VI-NEXT: v_or_b32_e32 v22, 0x400000, v19 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; VI-NEXT: s_and_b32 s6, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v19, v21, v22, vcc -; VI-NEXT: v_add_f32_e32 v21, s6, v31 -; VI-NEXT: v_bfe_u32 
v22, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v22, vcc, v22, v21 -; VI-NEXT: v_add_u32_e32 v22, vcc, 0x7fff, v22 -; VI-NEXT: v_or_b32_e32 v23, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: s_lshl_b32 s6, s21, 16 -; VI-NEXT: v_alignbit_b32 v19, v21, v19, 16 -; VI-NEXT: v_add_f32_e32 v21, s6, v31 -; VI-NEXT: v_bfe_u32 v22, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v22, vcc, v22, v21 -; VI-NEXT: v_add_u32_e32 v22, vcc, 0x7fff, v22 -; VI-NEXT: v_or_b32_e32 v23, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc -; VI-NEXT: v_add_f32_e32 v22, s6, v31 -; VI-NEXT: v_bfe_u32 v23, v22, 16, 1 -; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v22 -; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 -; VI-NEXT: v_or_b32_e32 v24, 0x400000, v22 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; VI-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: s_lshl_b32 s6, s20, 16 -; VI-NEXT: v_alignbit_b32 v22, v22, v21, 16 -; VI-NEXT: v_add_f32_e32 v21, s6, v31 -; VI-NEXT: v_bfe_u32 v23, v21, 16, 1 -; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v21 -; VI-NEXT: v_add_u32_e32 v23, vcc, 0x7fff, v23 -; VI-NEXT: v_or_b32_e32 v24, 0x400000, v21 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; VI-NEXT: s_and_b32 s6, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v21, v23, v24, vcc -; VI-NEXT: v_add_f32_e32 v23, s6, v31 -; VI-NEXT: v_bfe_u32 v24, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v24, vcc, v24, v23 -; VI-NEXT: v_add_u32_e32 v24, vcc, 0x7fff, v24 -; VI-NEXT: v_or_b32_e32 v25, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: s_lshl_b32 s6, s23, 16 -; VI-NEXT: v_alignbit_b32 v21, v23, v21, 16 -; VI-NEXT: v_add_f32_e32 v23, s6, v31 -; VI-NEXT: v_bfe_u32 v24, v23, 16, 1 -; 
VI-NEXT: v_add_u32_e32 v24, vcc, v24, v23 -; VI-NEXT: v_add_u32_e32 v24, vcc, 0x7fff, v24 -; VI-NEXT: v_or_b32_e32 v25, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc -; VI-NEXT: v_add_f32_e32 v24, s6, v31 -; VI-NEXT: v_bfe_u32 v25, v24, 16, 1 -; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v24 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 -; VI-NEXT: v_or_b32_e32 v26, 0x400000, v24 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; VI-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: s_lshl_b32 s6, s22, 16 -; VI-NEXT: v_alignbit_b32 v24, v24, v23, 16 -; VI-NEXT: v_add_f32_e32 v23, s6, v31 -; VI-NEXT: v_bfe_u32 v25, v23, 16, 1 -; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v23 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x7fff, v25 -; VI-NEXT: v_or_b32_e32 v26, 0x400000, v23 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; VI-NEXT: s_and_b32 s6, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v23, v25, v26, vcc -; VI-NEXT: v_add_f32_e32 v25, s6, v31 -; VI-NEXT: v_bfe_u32 v26, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v25 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x7fff, v26 -; VI-NEXT: v_or_b32_e32 v27, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: s_lshl_b32 s6, s25, 16 -; VI-NEXT: v_alignbit_b32 v23, v25, v23, 16 -; VI-NEXT: v_add_f32_e32 v25, s6, v31 -; VI-NEXT: v_bfe_u32 v26, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v25 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x7fff, v26 -; VI-NEXT: v_or_b32_e32 v27, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc -; VI-NEXT: v_add_f32_e32 v26, s6, v31 -; VI-NEXT: v_bfe_u32 v27, v26, 16, 1 -; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v26 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 -; VI-NEXT: 
v_or_b32_e32 v28, 0x400000, v26 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; VI-NEXT: v_cndmask_b32_e32 v26, v27, v28, vcc -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: s_lshl_b32 s6, s24, 16 -; VI-NEXT: v_alignbit_b32 v26, v26, v25, 16 -; VI-NEXT: v_add_f32_e32 v25, s6, v31 -; VI-NEXT: v_bfe_u32 v27, v25, 16, 1 -; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v25 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x7fff, v27 -; VI-NEXT: v_or_b32_e32 v28, 0x400000, v25 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; VI-NEXT: s_and_b32 s6, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v25, v27, v28, vcc -; VI-NEXT: v_add_f32_e32 v27, s6, v31 -; VI-NEXT: v_bfe_u32 v28, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v27 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28 -; VI-NEXT: v_or_b32_e32 v29, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: s_lshl_b32 s6, s27, 16 -; VI-NEXT: v_alignbit_b32 v25, v27, v25, 16 -; VI-NEXT: v_add_f32_e32 v27, s6, v31 -; VI-NEXT: v_bfe_u32 v28, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v27 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x7fff, v28 -; VI-NEXT: v_or_b32_e32 v29, 0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc -; VI-NEXT: v_add_f32_e32 v28, s6, v31 -; VI-NEXT: v_bfe_u32 v29, v28, 16, 1 -; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v28 -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 -; VI-NEXT: v_or_b32_e32 v30, 0x400000, v28 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; VI-NEXT: v_cndmask_b32_e32 v28, v29, v30, vcc -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: s_lshl_b32 s6, s26, 16 -; VI-NEXT: v_alignbit_b32 v28, v28, v27, 16 -; VI-NEXT: v_add_f32_e32 v27, s6, v31 -; VI-NEXT: v_bfe_u32 v29, v27, 16, 1 -; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v27 -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x7fff, v29 -; VI-NEXT: v_or_b32_e32 v30, 
0x400000, v27 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; VI-NEXT: s_and_b32 s6, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v27, v29, v30, vcc -; VI-NEXT: v_add_f32_e32 v29, s6, v31 -; VI-NEXT: v_bfe_u32 v30, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v29 -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30 -; VI-NEXT: v_or_b32_e32 v32, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: v_cndmask_b32_e32 v29, v30, v32, vcc -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: s_lshl_b32 s6, s29, 16 -; VI-NEXT: v_alignbit_b32 v27, v29, v27, 16 -; VI-NEXT: v_add_f32_e32 v29, s6, v31 -; VI-NEXT: v_bfe_u32 v30, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v29 -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x7fff, v30 -; VI-NEXT: v_or_b32_e32 v32, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: s_and_b32 s6, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v29, v30, v32, vcc -; VI-NEXT: v_add_f32_e32 v30, s6, v31 -; VI-NEXT: v_bfe_u32 v32, v30, 16, 1 -; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v30 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x7fff, v32 -; VI-NEXT: v_or_b32_e32 v33, 0x400000, v30 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; VI-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: s_lshl_b32 s6, s28, 16 -; VI-NEXT: v_alignbit_b32 v30, v30, v29, 16 -; VI-NEXT: v_add_f32_e32 v29, s6, v31 -; VI-NEXT: v_bfe_u32 v32, v29, 16, 1 -; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v29 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x7fff, v32 -; VI-NEXT: v_or_b32_e32 v33, 0x400000, v29 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v29, v32, v33, vcc -; VI-NEXT: v_add_f32_e32 v32, s6, v31 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, 
vcc -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: s_lshl_b32 s6, s5, 16 -; VI-NEXT: v_alignbit_b32 v29, v32, v29, 16 -; VI-NEXT: v_add_f32_e32 v32, s6, v31 -; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 -; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 -; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; VI-NEXT: s_and_b32 s5, s5, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; VI-NEXT: v_add_f32_e32 v33, s5, v31 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; VI-NEXT: s_lshl_b32 s5, s4, 16 -; VI-NEXT: v_alignbit_b32 v32, v33, v32, 16 -; VI-NEXT: v_add_f32_e32 v33, s5, v31 -; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 -; VI-NEXT: s_and_b32 s4, s4, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; VI-NEXT: v_add_f32_e32 v31, s4, v31 -; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc -; VI-NEXT: v_bfe_u32 v34, v31, 16, 1 -; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v31 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 -; VI-NEXT: v_or_b32_e32 v35, 0x400000, v31 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; VI-NEXT: v_cndmask_b32_e32 v31, v34, v35, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v31, v33, 16 -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; 
VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[25:26] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: 
buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[3:4] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[1:2] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v32 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: 
v_lshrrev_b32_e32 v33, 16, v29 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; VI-NEXT: buffer_store_dword v33, 
off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 -; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v2 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v2 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 -; VI-NEXT: 
buffer_store_dword v35, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v12 -; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v8 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v2 -; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[23:24] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v16 -; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v13 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v12 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v7 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v14 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v14 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v13 -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v12 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v11 -; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v10 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v10 -; 
VI-NEXT: v_lshrrev_b32_e32 v40, 8, v10 -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v9 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v7 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v3 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v1 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_or_b32 s44, s16, s8 +; VI-NEXT: s_add_i32 s8, s7, 0x7fff +; VI-NEXT: s_or_b32 s9, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s6, s9, s8 +; VI-NEXT: s_lshl_b32 s8, s19, 16 +; VI-NEXT: v_add_f32_e32 v2, s8, v1 +; VI-NEXT: v_readfirstlane_b32 s8, v2 +; VI-NEXT: s_bfe_u32 s9, s8, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s8 +; VI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; VI-NEXT: s_add_i32 s10, s9, 0x7fff +; VI-NEXT: s_or_b32 s11, s8, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[8:9], vcc, exec +; VI-NEXT: s_cselect_b32 s8, s11, s10 +; VI-NEXT: s_lshr_b32 s19, s8, 16 +; VI-NEXT: s_or_b32 s57, s19, s7 +; VI-NEXT: s_and_b32 s7, s18, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s7, v1 +; 
VI-NEXT: v_readfirstlane_b32 s7, v2 +; VI-NEXT: s_bfe_u32 s8, s7, 0x10010 +; VI-NEXT: s_add_i32 s8, s8, s7 +; VI-NEXT: s_add_i32 s10, s8, 0x7fff +; VI-NEXT: s_bitset1_b32 s7, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[8:9], vcc, exec +; VI-NEXT: s_cselect_b32 s7, s7, s10 +; VI-NEXT: s_lshl_b32 s8, s18, 16 +; VI-NEXT: v_add_f32_e32 v2, s8, v1 +; VI-NEXT: v_readfirstlane_b32 s8, v2 +; VI-NEXT: s_bfe_u32 s9, s8, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s8 +; VI-NEXT: s_and_b32 s10, s7, 0xffff0000 +; VI-NEXT: s_add_i32 s11, s9, 0x7fff +; VI-NEXT: s_or_b32 s12, s8, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[8:9], vcc, exec +; VI-NEXT: s_cselect_b32 s8, s12, s11 +; VI-NEXT: s_lshr_b32 s18, s8, 16 +; VI-NEXT: s_and_b32 s8, s21, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s8, v1 +; VI-NEXT: v_readfirstlane_b32 s8, v2 +; VI-NEXT: s_bfe_u32 s9, s8, 0x10010 +; VI-NEXT: s_add_i32 s9, s9, s8 +; VI-NEXT: s_or_b32 s56, s18, s10 +; VI-NEXT: s_add_i32 s10, s9, 0x7fff +; VI-NEXT: s_or_b32 s11, s8, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[8:9], vcc, exec +; VI-NEXT: s_cselect_b32 s8, s11, s10 +; VI-NEXT: s_lshl_b32 s10, s21, 16 +; VI-NEXT: v_add_f32_e32 v2, s10, v1 +; VI-NEXT: v_readfirstlane_b32 s10, v2 +; VI-NEXT: s_bfe_u32 s11, s10, 0x10010 +; VI-NEXT: s_add_i32 s11, s11, s10 +; VI-NEXT: s_and_b32 s9, s8, 0xffff0000 +; VI-NEXT: s_add_i32 s12, s11, 0x7fff +; VI-NEXT: s_or_b32 s13, s10, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s10, s13, s12 +; VI-NEXT: s_lshr_b32 s21, s10, 16 +; VI-NEXT: s_or_b32 s59, s21, s9 +; VI-NEXT: s_and_b32 s9, s20, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s9, v1 +; VI-NEXT: v_readfirstlane_b32 s9, v2 +; VI-NEXT: s_bfe_u32 s10, s9, 0x10010 +; VI-NEXT: s_add_i32 s10, s10, s9 +; VI-NEXT: s_add_i32 s12, s10, 0x7fff +; VI-NEXT: s_bitset1_b32 s9, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 
s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s9, s9, s12 +; VI-NEXT: s_lshl_b32 s10, s20, 16 +; VI-NEXT: v_add_f32_e32 v2, s10, v1 +; VI-NEXT: v_readfirstlane_b32 s10, v2 +; VI-NEXT: s_bfe_u32 s11, s10, 0x10010 +; VI-NEXT: s_add_i32 s11, s11, s10 +; VI-NEXT: s_and_b32 s12, s9, 0xffff0000 +; VI-NEXT: s_add_i32 s13, s11, 0x7fff +; VI-NEXT: s_or_b32 s14, s10, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s10, s14, s13 +; VI-NEXT: s_lshr_b32 s20, s10, 16 +; VI-NEXT: s_and_b32 s10, s23, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s10, v1 +; VI-NEXT: v_readfirstlane_b32 s10, v2 +; VI-NEXT: s_bfe_u32 s11, s10, 0x10010 +; VI-NEXT: s_add_i32 s11, s11, s10 +; VI-NEXT: s_or_b32 s58, s20, s12 +; VI-NEXT: s_add_i32 s12, s11, 0x7fff +; VI-NEXT: s_or_b32 s13, s10, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[10:11], vcc, exec +; VI-NEXT: s_cselect_b32 s10, s13, s12 +; VI-NEXT: s_lshl_b32 s12, s23, 16 +; VI-NEXT: v_add_f32_e32 v2, s12, v1 +; VI-NEXT: v_readfirstlane_b32 s12, v2 +; VI-NEXT: s_bfe_u32 s13, s12, 0x10010 +; VI-NEXT: s_add_i32 s13, s13, s12 +; VI-NEXT: s_and_b32 s11, s10, 0xffff0000 +; VI-NEXT: s_add_i32 s14, s13, 0x7fff +; VI-NEXT: s_or_b32 s15, s12, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s12, s15, s14 +; VI-NEXT: s_lshr_b32 s23, s12, 16 +; VI-NEXT: s_or_b32 s61, s23, s11 +; VI-NEXT: s_and_b32 s11, s22, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s11, v1 +; VI-NEXT: v_readfirstlane_b32 s11, v2 +; VI-NEXT: s_bfe_u32 s12, s11, 0x10010 +; VI-NEXT: s_add_i32 s12, s12, s11 +; VI-NEXT: s_add_i32 s14, s12, 0x7fff +; VI-NEXT: s_bitset1_b32 s11, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s11, s11, s14 +; VI-NEXT: s_lshl_b32 s12, s22, 16 +; VI-NEXT: v_add_f32_e32 v2, s12, v1 +; VI-NEXT: v_readfirstlane_b32 s12, v2 +; VI-NEXT: s_bfe_u32 s13, 
s12, 0x10010 +; VI-NEXT: s_add_i32 s13, s13, s12 +; VI-NEXT: s_and_b32 s14, s11, 0xffff0000 +; VI-NEXT: s_add_i32 s15, s13, 0x7fff +; VI-NEXT: s_or_b32 s22, s12, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s12, s22, s15 +; VI-NEXT: s_lshr_b32 s22, s12, 16 +; VI-NEXT: s_and_b32 s12, s25, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s12, v1 +; VI-NEXT: v_readfirstlane_b32 s12, v2 +; VI-NEXT: s_bfe_u32 s13, s12, 0x10010 +; VI-NEXT: s_add_i32 s13, s13, s12 +; VI-NEXT: s_or_b32 s60, s22, s14 +; VI-NEXT: s_add_i32 s14, s13, 0x7fff +; VI-NEXT: s_or_b32 s15, s12, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[12:13], vcc, exec +; VI-NEXT: s_cselect_b32 s12, s15, s14 +; VI-NEXT: s_lshl_b32 s14, s25, 16 +; VI-NEXT: v_add_f32_e32 v2, s14, v1 +; VI-NEXT: v_readfirstlane_b32 s14, v2 +; VI-NEXT: s_bfe_u32 s15, s14, 0x10010 +; VI-NEXT: s_add_i32 s15, s15, s14 +; VI-NEXT: s_and_b32 s13, s12, 0xffff0000 +; VI-NEXT: s_add_i32 s25, s15, 0x7fff +; VI-NEXT: s_or_b32 s40, s14, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s40, s25 +; VI-NEXT: s_lshr_b32 s25, s14, 16 +; VI-NEXT: s_or_b32 s63, s25, s13 +; VI-NEXT: s_and_b32 s13, s24, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s13, v1 +; VI-NEXT: v_readfirstlane_b32 s13, v2 +; VI-NEXT: s_bfe_u32 s14, s13, 0x10010 +; VI-NEXT: s_add_i32 s14, s14, s13 +; VI-NEXT: s_add_i32 s40, s14, 0x7fff +; VI-NEXT: s_bitset1_b32 s13, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s13, s13, s40 +; VI-NEXT: s_lshl_b32 s14, s24, 16 +; VI-NEXT: v_add_f32_e32 v2, s14, v1 +; VI-NEXT: v_readfirstlane_b32 s14, v2 +; VI-NEXT: s_bfe_u32 s15, s14, 0x10010 +; VI-NEXT: s_add_i32 s15, s15, s14 +; VI-NEXT: s_and_b32 s40, s13, 0xffff0000 +; VI-NEXT: s_add_i32 s24, s15, 0x7fff +; VI-NEXT: s_or_b32 s41, s14, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 
vcc, v2, v2 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s41, s24 +; VI-NEXT: s_lshr_b32 s24, s14, 16 +; VI-NEXT: s_and_b32 s14, s27, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s14, v1 +; VI-NEXT: v_readfirstlane_b32 s14, v2 +; VI-NEXT: s_bfe_u32 s15, s14, 0x10010 +; VI-NEXT: s_add_i32 s15, s15, s14 +; VI-NEXT: s_or_b32 s62, s24, s40 +; VI-NEXT: s_add_i32 s40, s15, 0x7fff +; VI-NEXT: s_or_b32 s41, s14, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[14:15], vcc, exec +; VI-NEXT: s_cselect_b32 s14, s41, s40 +; VI-NEXT: s_lshl_b32 s27, s27, 16 +; VI-NEXT: v_add_f32_e32 v2, s27, v1 +; VI-NEXT: v_readfirstlane_b32 s27, v2 +; VI-NEXT: s_bfe_u32 s40, s27, 0x10010 +; VI-NEXT: s_add_i32 s40, s40, s27 +; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v10 +; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[10:11] +; VI-NEXT: s_and_b32 s15, s14, 0xffff0000 +; VI-NEXT: s_add_i32 s42, s40, 0x7fff +; VI-NEXT: s_bitset1_b32 s27, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v40 +; VI-NEXT: v_lshrrev_b32_e32 v40, 24, v20 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v20 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: s_and_b64 s[40:41], vcc, exec +; VI-NEXT: s_cselect_b32 s27, s27, s42 +; VI-NEXT: s_lshr_b32 s27, s27, 16 +; VI-NEXT: s_or_b32 s73, s27, s15 +; VI-NEXT: s_and_b32 s15, s26, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s15, v1 +; VI-NEXT: v_readfirstlane_b32 s15, v2 +; VI-NEXT: s_bfe_u32 s40, s15, 0x10010 +; VI-NEXT: s_add_i32 s40, s40, s15 +; VI-NEXT: s_add_i32 s42, s40, 0x7fff +; VI-NEXT: s_bitset1_b32 s15, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[40:41], vcc, exec +; VI-NEXT: s_cselect_b32 s15, s15, s42 +; VI-NEXT: s_lshl_b32 s26, s26, 16 +; VI-NEXT: v_add_f32_e32 v2, s26, v1 +; VI-NEXT: v_readfirstlane_b32 s26, v2 +; VI-NEXT: s_bfe_u32 s40, s26, 0x10010 +; VI-NEXT: s_add_i32 s40, s40, s26 +; 
VI-NEXT: s_and_b32 s42, s15, 0xffff0000 +; VI-NEXT: s_add_i32 s43, s40, 0x7fff +; VI-NEXT: s_bitset1_b32 s26, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[40:41], vcc, exec +; VI-NEXT: s_cselect_b32 s26, s26, s43 +; VI-NEXT: s_and_b32 s40, s29, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s40, v1 +; VI-NEXT: v_readfirstlane_b32 s40, v2 +; VI-NEXT: s_bfe_u32 s41, s40, 0x10010 +; VI-NEXT: s_lshr_b32 s26, s26, 16 +; VI-NEXT: s_add_i32 s41, s41, s40 +; VI-NEXT: s_or_b32 s72, s26, s42 +; VI-NEXT: s_add_i32 s42, s41, 0x7fff +; VI-NEXT: s_or_b32 s43, s40, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[40:41], vcc, exec +; VI-NEXT: s_cselect_b32 s40, s43, s42 +; VI-NEXT: s_lshl_b32 s29, s29, 16 +; VI-NEXT: v_add_f32_e32 v2, s29, v1 +; VI-NEXT: v_readfirstlane_b32 s29, v2 +; VI-NEXT: s_bfe_u32 s42, s29, 0x10010 +; VI-NEXT: s_add_i32 s42, s42, s29 +; VI-NEXT: s_and_b32 s41, s40, 0xffff0000 +; VI-NEXT: s_add_i32 s46, s42, 0x7fff +; VI-NEXT: s_bitset1_b32 s29, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s29, s29, s46 +; VI-NEXT: s_lshr_b32 s29, s29, 16 +; VI-NEXT: s_or_b32 s75, s29, s41 +; VI-NEXT: s_and_b32 s41, s28, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s41, v1 +; VI-NEXT: v_readfirstlane_b32 s41, v2 +; VI-NEXT: s_bfe_u32 s42, s41, 0x10010 +; VI-NEXT: s_add_i32 s42, s42, s41 +; VI-NEXT: s_add_i32 s46, s42, 0x7fff +; VI-NEXT: s_bitset1_b32 s41, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s41, s41, s46 +; VI-NEXT: s_lshl_b32 s28, s28, 16 +; VI-NEXT: v_add_f32_e32 v2, s28, v1 +; VI-NEXT: v_readfirstlane_b32 s28, v2 +; VI-NEXT: s_bfe_u32 s42, s28, 0x10010 +; VI-NEXT: s_add_i32 s42, s42, s28 +; VI-NEXT: s_and_b32 s46, s41, 0xffff0000 +; VI-NEXT: s_add_i32 s47, s42, 0x7fff +; VI-NEXT: s_bitset1_b32 s28, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: 
s_cselect_b32 s28, s28, s47 +; VI-NEXT: s_and_b32 s42, s5, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s42, v1 +; VI-NEXT: v_readfirstlane_b32 s42, v2 +; VI-NEXT: s_bfe_u32 s43, s42, 0x10010 +; VI-NEXT: s_lshr_b32 s28, s28, 16 +; VI-NEXT: s_add_i32 s43, s43, s42 +; VI-NEXT: s_or_b32 s74, s28, s46 +; VI-NEXT: s_add_i32 s46, s43, 0x7fff +; VI-NEXT: s_or_b32 s47, s42, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s46, s47, s46 +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_add_f32_e32 v2, s5, v1 +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: s_bfe_u32 s42, s5, 0x10010 +; VI-NEXT: s_add_i32 s42, s42, s5 +; VI-NEXT: s_and_b32 s47, s46, 0xffff0000 +; VI-NEXT: s_add_i32 s76, s42, 0x7fff +; VI-NEXT: s_bitset1_b32 s5, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s5, s5, s76 +; VI-NEXT: s_and_b32 s42, s4, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s42, v1 +; VI-NEXT: v_readfirstlane_b32 s42, v2 +; VI-NEXT: s_bfe_u32 s43, s42, 0x10010 +; VI-NEXT: s_lshr_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s43, s43, s42 +; VI-NEXT: s_or_b32 s77, s5, s47 +; VI-NEXT: s_add_i32 s47, s43, 0x7fff +; VI-NEXT: s_or_b32 s76, s42, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: s_cselect_b32 s47, s76, s47 +; VI-NEXT: s_lshl_b32 s4, s4, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 +; VI-NEXT: v_readfirstlane_b32 s4, v1 +; VI-NEXT: s_bfe_u32 s42, s4, 0x10010 +; VI-NEXT: s_add_i32 s42, s42, s4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[18:19] +; VI-NEXT: s_and_b32 s76, s47, 0xffff0000 +; VI-NEXT: s_add_i32 s88, s42, 0x7fff +; VI-NEXT: s_bitset1_b32 s4, 22 +; VI-NEXT: v_lshrrev_b64 v[2:3], 24, v[16:17] +; VI-NEXT: s_and_b64 s[42:43], vcc, exec +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v4 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[4:5] +; VI-NEXT: s_cselect_b32 s4, s4, s88 +; VI-NEXT: 
v_lshrrev_b32_e32 v38, 8, v5 +; VI-NEXT: v_lshrrev_b64 v[4:5], 24, v[14:15] +; VI-NEXT: s_lshr_b32 s4, s4, 16 +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v6 +; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[6:7] +; VI-NEXT: s_or_b32 s76, s4, s76 +; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v7 +; VI-NEXT: v_lshrrev_b64 v[6:7], 24, v[12:13] +; VI-NEXT: s_lshr_b64 s[88:89], s[76:77], 24 +; VI-NEXT: s_lshr_b64 s[30:31], s[74:75], 24 +; VI-NEXT: s_lshr_b64 s[34:35], s[72:73], 24 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v8 +; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[8:9] +; VI-NEXT: s_lshr_b64 s[38:39], s[62:63], 24 +; VI-NEXT: s_lshr_b64 s[48:49], s[60:61], 24 +; VI-NEXT: s_lshr_b64 s[50:51], s[58:59], 24 +; VI-NEXT: s_lshr_b64 s[52:53], s[56:57], 24 +; VI-NEXT: s_lshr_b64 s[54:55], s[44:45], 24 +; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v18 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v9 +; VI-NEXT: s_lshr_b32 s42, s77, 8 +; VI-NEXT: s_lshr_b32 s76, s76, 8 +; VI-NEXT: s_lshr_b32 s75, s75, 8 +; VI-NEXT: s_lshr_b32 s74, s74, 8 +; VI-NEXT: s_lshr_b32 s73, s73, 8 +; VI-NEXT: s_lshr_b32 s72, s72, 8 +; VI-NEXT: s_lshr_b32 s63, s63, 8 +; VI-NEXT: s_lshr_b32 s62, s62, 8 +; VI-NEXT: s_lshr_b32 s61, s61, 8 +; VI-NEXT: s_lshr_b32 s60, s60, 8 +; VI-NEXT: s_lshr_b32 s59, s59, 8 +; VI-NEXT: s_lshr_b32 s58, s58, 8 +; VI-NEXT: s_lshr_b32 s77, s57, 8 +; VI-NEXT: s_lshr_b32 s90, s56, 8 +; VI-NEXT: s_lshr_b32 vcc_lo, s45, 8 +; VI-NEXT: s_lshr_b32 vcc_hi, s44, 8 +; VI-NEXT: s_lshr_b32 s43, s46, 24 +; VI-NEXT: s_lshr_b32 s44, s46, 16 +; VI-NEXT: s_lshr_b32 s45, s47, 16 +; VI-NEXT: s_lshr_b32 s46, s40, 24 +; VI-NEXT: s_lshr_b32 s40, s40, 16 +; VI-NEXT: s_lshr_b32 s41, s41, 16 +; VI-NEXT: s_lshr_b32 s47, s14, 24 +; VI-NEXT: s_lshr_b32 s14, s14, 
16 +; VI-NEXT: s_lshr_b32 s15, s15, 16 +; VI-NEXT: s_lshr_b32 s56, s12, 24 +; VI-NEXT: s_lshr_b32 s12, s12, 16 +; VI-NEXT: s_lshr_b32 s13, s13, 16 +; VI-NEXT: s_lshr_b32 s57, s10, 24 +; VI-NEXT: s_lshr_b32 s10, s10, 16 +; VI-NEXT: s_lshr_b32 s11, s11, 16 +; VI-NEXT: s_lshr_b32 s89, s8, 24 +; VI-NEXT: s_lshr_b32 s91, s8, 16 +; VI-NEXT: s_lshr_b32 s9, s9, 16 +; VI-NEXT: s_lshr_b32 s31, s6, 24 +; VI-NEXT: s_lshr_b32 s35, s6, 16 +; VI-NEXT: s_lshr_b32 s36, s7, 16 +; VI-NEXT: s_lshr_b32 s37, s78, 24 +; VI-NEXT: s_lshr_b32 s78, s78, 16 +; VI-NEXT: s_lshr_b32 s8, s79, 16 +; VI-NEXT: v_lshrrev_b32_e32 v8, 24, v52 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v52 +; VI-NEXT: v_lshrrev_b32_e32 v12, 24, v22 +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v51 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v37 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v37 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v36 +; VI-NEXT: v_lshrrev_b32_e32 v60, 24, v59 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v59 +; VI-NEXT: v_lshrrev_b32_e32 v61, 16, v61 +; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v54 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v44 +; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v48 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v48 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v20, 24, v23 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v24 ; VI-NEXT: s_branch .LBB91_5 ; VI-NEXT: .LBB91_3: -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr71 -; VI-NEXT: ; implicit-def: $sgpr69 -; VI-NEXT: ; implicit-def: $sgpr70 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; kill: killed $sgpr8 +; VI-NEXT: ; 
implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: v_writelane_b32 v62, s78, 0 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: v_writelane_b32 v62, s79, 1 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr8 ; VI-NEXT: ; implicit-def: $sgpr68 -; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr84 ; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr7 +; VI-NEXT: ; implicit-def: $sgpr11 +; VI-NEXT: ; implicit-def: $sgpr85 +; VI-NEXT: ; implicit-def: $sgpr69 +; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr80 +; VI-NEXT: ; implicit-def: $sgpr86 +; VI-NEXT: ; implicit-def: $sgpr70 +; VI-NEXT: ; implicit-def: $sgpr71 +; VI-NEXT: ; implicit-def: $sgpr55 +; VI-NEXT: ; implicit-def: $sgpr81 +; VI-NEXT: ; implicit-def: $sgpr87 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr13 ; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr82 +; VI-NEXT: ; implicit-def: $sgpr83 ; VI-NEXT: ; implicit-def: $sgpr65 -; VI-NEXT: ; implicit-def: $sgpr55 -; VI-NEXT: ; implicit-def: $sgpr54 -; VI-NEXT: ; implicit-def: $sgpr53 -; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; kill: killed $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: v_writelane_b32 v62, s78, 2 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: v_writelane_b32 v62, s79, 3 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; implicit-def: $sgpr52 +; VI-NEXT: ; implicit-def: $sgpr42 +; 
VI-NEXT: ; implicit-def: $sgpr43 +; VI-NEXT: ; implicit-def: $sgpr9 +; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr47 ; VI-NEXT: ; implicit-def: $sgpr50 -; VI-NEXT: ; implicit-def: $sgpr87 -; VI-NEXT: ; implicit-def: $sgpr86 -; VI-NEXT: ; implicit-def: $sgpr84 -; VI-NEXT: ; implicit-def: $sgpr85 -; VI-NEXT: ; implicit-def: $sgpr83 -; VI-NEXT: ; implicit-def: $sgpr82 -; VI-NEXT: ; implicit-def: $sgpr81 -; VI-NEXT: ; implicit-def: $sgpr80 -; VI-NEXT: ; implicit-def: $sgpr76 -; VI-NEXT: ; implicit-def: $sgpr74 -; VI-NEXT: ; implicit-def: $sgpr72 -; VI-NEXT: ; implicit-def: $sgpr62 -; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr58 -; VI-NEXT: ; implicit-def: $sgpr56 ; VI-NEXT: ; implicit-def: $sgpr48 ; VI-NEXT: ; implicit-def: $sgpr38 ; VI-NEXT: ; implicit-def: $sgpr36 @@ -165349,406 +166747,433 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: ; implicit-def: $sgpr30 ; VI-NEXT: ; implicit-def: $sgpr90 ; VI-NEXT: ; implicit-def: $sgpr88 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: v_writelane_b32 v62, s78, 4 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: v_writelane_b32 v62, s79, 5 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 ; VI-NEXT: ; implicit-def: $sgpr78 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 
-; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: 
; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: v_writelane_b32 v62, s78, 6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: v_writelane_b32 v62, s79, 7 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: v_writelane_b32 v62, s78, 8 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: v_writelane_b32 v62, s79, 9 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; 
kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: v_writelane_b32 v62, s78, 10 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: v_writelane_b32 v62, s79, 11 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: v_writelane_b32 v62, s78, 12 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: v_writelane_b32 v62, s79, 13 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: v_writelane_b32 v62, s78, 14 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: v_writelane_b32 v62, s79, 15 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr78 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; 
VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; kill: killed $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr6 ; VI-NEXT: s_branch .LBB91_2 ; VI-NEXT: .LBB91_4: -; VI-NEXT: v_mov_b32_e32 v33, s71 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s69 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s70 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s68 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s67 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s86 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s83 -; VI-NEXT: v_mov_b32_e32 v31, s4 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s82 -; VI-NEXT: v_readlane_b32 s4, v62, 0 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 1 -; VI-NEXT: v_mov_b32_e32 v40, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 2 -; VI-NEXT: v_mov_b32_e32 v44, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 3 -; VI-NEXT: v_mov_b32_e32 v54, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 4 -; VI-NEXT: v_mov_b32_e32 v53, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 5 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 6 -; VI-NEXT: v_mov_b32_e32 v51, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 7 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 8 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 
offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 9 -; VI-NEXT: v_mov_b32_e32 v56, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 10 -; VI-NEXT: v_mov_b32_e32 v47, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 11 -; VI-NEXT: v_mov_b32_e32 v48, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 12 -; VI-NEXT: v_mov_b32_e32 v43, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 13 -; VI-NEXT: v_mov_b32_e32 v46, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 14 -; VI-NEXT: v_mov_b32_e32 v50, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 15 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 16 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 17 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 19 -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 20 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 21 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 22 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 23 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 24 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 
v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 25 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 26 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 27 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 28 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 29 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 30 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 31 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 32 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 33 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 34 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 35 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 36 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 37 -; VI-NEXT: buffer_store_dword v55, off, 
s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 38 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 39 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 40 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 41 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 42 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 43 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 44 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 45 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 46 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 47 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 48 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 49 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: 
v_readlane_b32 s4, v62, 50 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 51 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 52 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 53 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 54 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 55 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 56 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 57 -; VI-NEXT: v_mov_b32_e32 v42, s54 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_mov_b32_e32 v41, s46 -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v41, s56 -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v41, s58 -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:208 ; 4-byte Folded 
Spill -; VI-NEXT: v_mov_b32_e32 v41, s60 -; VI-NEXT: v_mov_b32_e32 v45, s72 -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v45, s74 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v45, s76 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v45, s78 -; VI-NEXT: v_mov_b32_e32 v55, s88 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v36, s66 -; VI-NEXT: v_mov_b32_e32 v52, s64 -; VI-NEXT: v_mov_b32_e32 v55, v50 -; VI-NEXT: v_mov_b32_e32 v35, s30 -; VI-NEXT: v_mov_b32_e32 v59, s87 -; VI-NEXT: v_mov_b32_e32 v58, s34 -; VI-NEXT: v_mov_b32_e32 v45, s36 -; VI-NEXT: v_mov_b32_e32 v34, s38 -; VI-NEXT: v_mov_b32_e32 v1, s44 -; VI-NEXT: v_mov_b32_e32 v2, s45 -; VI-NEXT: v_mov_b32_e32 v3, s42 -; VI-NEXT: v_mov_b32_e32 v4, s43 -; VI-NEXT: v_mov_b32_e32 v5, s40 -; VI-NEXT: v_mov_b32_e32 v6, s41 -; VI-NEXT: v_mov_b32_e32 v7, s14 -; VI-NEXT: v_mov_b32_e32 v8, s15 -; VI-NEXT: v_mov_b32_e32 v9, s12 -; VI-NEXT: v_mov_b32_e32 v10, s13 -; VI-NEXT: v_mov_b32_e32 v11, s10 -; VI-NEXT: v_mov_b32_e32 v12, s11 -; VI-NEXT: v_mov_b32_e32 v13, s8 -; VI-NEXT: v_mov_b32_e32 v14, 
s9 -; VI-NEXT: v_mov_b32_e32 v15, s6 -; VI-NEXT: v_mov_b32_e32 v16, s7 -; VI-NEXT: v_mov_b32_e32 v17, s16 -; VI-NEXT: v_mov_b32_e32 v18, s17 -; VI-NEXT: v_mov_b32_e32 v19, s18 -; VI-NEXT: v_mov_b32_e32 v20, s19 -; VI-NEXT: v_mov_b32_e32 v21, s20 -; VI-NEXT: v_mov_b32_e32 v22, s21 -; VI-NEXT: v_mov_b32_e32 v23, s22 -; VI-NEXT: v_mov_b32_e32 v24, s23 -; VI-NEXT: v_mov_b32_e32 v25, s24 -; VI-NEXT: v_mov_b32_e32 v26, s25 -; VI-NEXT: v_mov_b32_e32 v27, s26 -; VI-NEXT: v_mov_b32_e32 v28, s27 -; VI-NEXT: v_mov_b32_e32 v29, s28 -; VI-NEXT: v_mov_b32_e32 v30, s29 -; VI-NEXT: v_mov_b32_e32 v32, s5 -; VI-NEXT: v_mov_b32_e32 v41, s62 -; VI-NEXT: v_mov_b32_e32 v57, s81 -; VI-NEXT: v_mov_b32_e32 v37, s84 -; VI-NEXT: v_mov_b32_e32 v60, s52 -; VI-NEXT: v_mov_b32_e32 v38, s51 -; VI-NEXT: v_mov_b32_e32 v61, s65 -; VI-NEXT: v_mov_b32_e32 v49, s66 -; VI-NEXT: v_mov_b32_e32 v39, s55 -; VI-NEXT: v_mov_b32_e32 v50, v46 -; VI-NEXT: v_mov_b32_e32 v46, v48 -; VI-NEXT: v_mov_b32_e32 v48, v47 -; VI-NEXT: v_mov_b32_e32 v47, v56 -; VI-NEXT: v_mov_b32_e32 v56, v51 -; VI-NEXT: v_mov_b32_e32 v51, s90 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s85 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: 
v_mov_b32_e32 v34, s48 -; VI-NEXT: v_mov_b32_e32 v51, v53 -; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: v_mov_b32_e32 v54, v40 -; VI-NEXT: v_mov_b32_e32 v40, s80 -; VI-NEXT: v_mov_b32_e32 v58, s50 -; VI-NEXT: v_mov_b32_e32 v45, s53 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s73 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s74 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s75 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s76 +; VI-NEXT: v_mov_b32_e32 v16, s8 +; VI-NEXT: v_readlane_b32 s8, v62, 16 +; VI-NEXT: v_mov_b32_e32 v2, s88 +; VI-NEXT: v_mov_b32_e32 v4, s30 +; VI-NEXT: v_mov_b32_e32 v5, s34 +; VI-NEXT: v_readlane_b32 s88, v62, 14 +; VI-NEXT: v_readlane_b32 s30, v62, 12 +; VI-NEXT: v_readlane_b32 s34, v62, 10 +; VI-NEXT: v_mov_b32_e32 v11, s51 +; VI-NEXT: v_mov_b32_e32 v8, s50 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s77 +; VI-NEXT: v_mov_b32_e32 v19, s53 +; VI-NEXT: v_mov_b32_e32 v17, s52 +; VI-NEXT: v_mov_b32_e32 v38, s8 +; VI-NEXT: v_readlane_b32 s8, v62, 17 +; VI-NEXT: v_mov_b32_e32 v15, s55 +; VI-NEXT: v_mov_b32_e32 v55, s54 +; VI-NEXT: v_mov_b32_e32 v10, s38 +; VI-NEXT: v_mov_b32_e32 v7, s48 +; VI-NEXT: v_readlane_b32 s89, v62, 15 +; VI-NEXT: v_readlane_b32 s31, v62, 13 +; VI-NEXT: v_readlane_b32 s35, v62, 11 +; VI-NEXT: v_readlane_b32 s38, v62, 8 +; VI-NEXT: v_readlane_b32 s48, v62, 6 +; VI-NEXT: v_readlane_b32 s50, v62, 4 +; VI-NEXT: v_readlane_b32 s52, v62, 2 +; VI-NEXT: v_readlane_b32 s54, v62, 0 +; VI-NEXT: v_mov_b32_e32 v35, s44 +; VI-NEXT: v_mov_b32_e32 v33, s45 +; VI-NEXT: v_mov_b32_e32 v9, s47 +; VI-NEXT: 
v_mov_b32_e32 v34, s56 +; VI-NEXT: v_mov_b32_e32 v52, s46 +; VI-NEXT: v_mov_b32_e32 v31, s57 +; VI-NEXT: v_mov_b32_e32 v12, s43 +; VI-NEXT: v_mov_b32_e32 v13, s42 +; VI-NEXT: v_mov_b32_e32 v32, s58 +; VI-NEXT: v_mov_b32_e32 v42, s41 +; VI-NEXT: v_mov_b32_e32 v29, s59 +; VI-NEXT: v_mov_b32_e32 v40, s40 +; VI-NEXT: v_mov_b32_e32 v41, s15 +; VI-NEXT: v_mov_b32_e32 v30, s60 +; VI-NEXT: v_mov_b32_e32 v57, s14 +; VI-NEXT: v_mov_b32_e32 v27, s61 +; VI-NEXT: v_mov_b32_e32 v46, s83 +; VI-NEXT: v_mov_b32_e32 v47, s82 +; VI-NEXT: v_mov_b32_e32 v28, s62 +; VI-NEXT: v_mov_b32_e32 v61, s81 +; VI-NEXT: v_mov_b32_e32 v25, s63 +; VI-NEXT: v_mov_b32_e32 v60, s13 +; VI-NEXT: v_mov_b32_e32 v59, s12 +; VI-NEXT: v_mov_b32_e32 v26, s72 +; VI-NEXT: v_mov_b32_e32 v44, s80 +; VI-NEXT: v_mov_b32_e32 v37, s71 +; VI-NEXT: v_mov_b32_e32 v54, s70 +; VI-NEXT: v_mov_b32_e32 v36, s11 +; VI-NEXT: v_mov_b32_e32 v51, s10 +; VI-NEXT: v_mov_b32_e32 v48, s69 +; VI-NEXT: v_mov_b32_e32 v22, s68 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v20, s67 +; VI-NEXT: v_mov_b32_e32 v21, s66 +; VI-NEXT: v_mov_b32_e32 v18, s9 +; VI-NEXT: v_mov_b32_e32 v39, s65 +; VI-NEXT: v_mov_b32_e32 v50, s64 +; VI-NEXT: v_mov_b32_e32 v49, s8 +; VI-NEXT: v_mov_b32_e32 v14, s87 +; VI-NEXT: v_mov_b32_e32 v53, s86 +; VI-NEXT: v_mov_b32_e32 v45, s7 +; VI-NEXT: v_mov_b32_e32 v43, s85 +; VI-NEXT: v_mov_b32_e32 v58, s6 +; VI-NEXT: v_mov_b32_e32 v56, s84 +; VI-NEXT: v_mov_b32_e32 v1, s78 +; VI-NEXT: v_mov_b32_e32 v3, s90 +; VI-NEXT: v_mov_b32_e32 v6, s36 +; VI-NEXT: v_readlane_b32 s42, v62, 18 +; VI-NEXT: v_readlane_b32 s43, v62, 19 +; VI-NEXT: v_readlane_b32 s76, v62, 20 +; VI-NEXT: v_readlane_b32 s44, v62, 21 +; VI-NEXT: v_readlane_b32 s75, v62, 22 +; VI-NEXT: v_readlane_b32 s45, v62, 23 +; VI-NEXT: v_readlane_b32 s74, v62, 24 +; VI-NEXT: v_readlane_b32 s46, v62, 25 +; VI-NEXT: v_readlane_b32 s40, v62, 26 +; VI-NEXT: v_readlane_b32 s73, v62, 27 +; VI-NEXT: v_readlane_b32 
s41, v62, 28 +; VI-NEXT: v_readlane_b32 s72, v62, 29 +; VI-NEXT: v_readlane_b32 s47, v62, 30 +; VI-NEXT: v_readlane_b32 s14, v62, 31 +; VI-NEXT: v_readlane_b32 s63, v62, 32 +; VI-NEXT: v_readlane_b32 s62, v62, 33 +; VI-NEXT: v_readlane_b32 s15, v62, 34 +; VI-NEXT: v_readlane_b32 s61, v62, 35 +; VI-NEXT: v_readlane_b32 s56, v62, 36 +; VI-NEXT: v_readlane_b32 s12, v62, 37 +; VI-NEXT: v_readlane_b32 s60, v62, 38 +; VI-NEXT: v_readlane_b32 s13, v62, 39 +; VI-NEXT: v_readlane_b32 s59, v62, 40 +; VI-NEXT: v_readlane_b32 s57, v62, 41 +; VI-NEXT: v_readlane_b32 s10, v62, 42 +; VI-NEXT: v_readlane_b32 s58, v62, 43 +; VI-NEXT: v_readlane_b32 s11, v62, 44 +; VI-NEXT: v_readlane_b32 s77, v62, 45 +; VI-NEXT: v_readlane_b32 s89, v62, 46 +; VI-NEXT: v_readlane_b32 s90, v62, 47 +; VI-NEXT: v_readlane_b32 s91, v62, 48 +; VI-NEXT: v_readlane_b32 vcc_lo, v62, 49 +; VI-NEXT: v_readlane_b32 s9, v62, 50 +; VI-NEXT: v_readlane_b32 vcc_hi, v62, 51 +; VI-NEXT: v_readlane_b32 s31, v62, 52 +; VI-NEXT: v_readlane_b32 s35, v62, 53 +; VI-NEXT: v_readlane_b32 s36, v62, 54 +; VI-NEXT: v_readlane_b32 s39, v62, 9 +; VI-NEXT: v_readlane_b32 s37, v62, 55 +; VI-NEXT: v_readlane_b32 s78, v62, 56 +; VI-NEXT: v_readlane_b32 s8, v62, 57 +; VI-NEXT: v_readlane_b32 s49, v62, 7 +; VI-NEXT: v_readlane_b32 s51, v62, 5 +; VI-NEXT: v_readlane_b32 s53, v62, 3 +; VI-NEXT: v_readlane_b32 s55, v62, 1 ; VI-NEXT: .LBB91_5: ; %end -; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v33 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v17, v17, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s6, s16, 0xff +; VI-NEXT: s_lshl_b32 s7, vcc_hi, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s8, 0xff +; VI-NEXT: s_lshl_b32 s8, s54, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v23, s6 +; VI-NEXT: s_and_b32 
s6, s17, 0xff +; VI-NEXT: s_lshl_b32 s7, vcc_lo, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s78, 0xff +; VI-NEXT: s_lshl_b32 s8, s37, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v24, s6 +; VI-NEXT: s_and_b32 s6, s18, 0xff +; VI-NEXT: s_lshl_b32 s7, s90, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s36, 0xff +; VI-NEXT: s_lshl_b32 s8, s52, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v23, vcc, 4, v0 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v24, s6 +; VI-NEXT: s_and_b32 s6, s19, 0xff +; VI-NEXT: s_lshl_b32 s7, s77, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s35, 0xff +; VI-NEXT: s_lshl_b32 s8, s31, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v23, vcc, 8, v0 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v24, s6 +; VI-NEXT: s_and_b32 s6, s20, 0xff +; VI-NEXT: s_lshl_b32 s7, s58, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s9, 0xff +; VI-NEXT: s_lshl_b32 s8, s50, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v23, vcc, 12, v0 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v24, s6 +; VI-NEXT: s_and_b32 s6, s21, 0xff +; VI-NEXT: s_lshl_b32 s7, s59, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s91, 0xff +; VI-NEXT: s_lshl_b32 s8, s89, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 
v23, vcc, 16, v0 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v24, s6 +; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s60, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s11, 0xff +; VI-NEXT: s_lshl_b32 s8, s48, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v23, vcc, 20, v0 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v24, s6 +; VI-NEXT: s_and_b32 s6, s23, 0xff +; VI-NEXT: s_lshl_b32 s7, s61, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s10, 0xff +; VI-NEXT: s_lshl_b32 s8, s57, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v23, vcc, 24, v0 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v24, s6 +; VI-NEXT: s_and_b32 s6, s24, 0xff +; VI-NEXT: s_lshl_b32 s7, s62, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s13, 0xff +; VI-NEXT: s_lshl_b32 s8, s38, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v23, vcc, 28, v0 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v24, s6 +; VI-NEXT: s_and_b32 s6, s25, 0xff +; VI-NEXT: s_lshl_b32 s7, s63, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s12, 0xff +; VI-NEXT: s_lshl_b32 s8, s56, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v23, vcc, 32, v0 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v24, s6 +; VI-NEXT: s_and_b32 s6, s26, 0xff +; VI-NEXT: s_lshl_b32 s7, s72, 8 +; VI-NEXT: s_or_b32 s6, 
s6, s7 +; VI-NEXT: s_and_b32 s7, s15, 0xff +; VI-NEXT: s_lshl_b32 s8, s34, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v23, vcc, 36, v0 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v24, s6 +; VI-NEXT: s_and_b32 s6, s27, 0xff +; VI-NEXT: s_lshl_b32 s7, s73, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s14, 0xff +; VI-NEXT: s_lshl_b32 s8, s47, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v23, vcc, 40, v0 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v24, s6 +; VI-NEXT: s_and_b32 s6, s28, 0xff +; VI-NEXT: s_lshl_b32 s7, s74, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s41, 0xff +; VI-NEXT: s_lshl_b32 s8, s30, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v23, vcc, 44, v0 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v24, s6 +; VI-NEXT: s_and_b32 s6, s29, 0xff +; VI-NEXT: s_lshl_b32 s7, s75, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s7, s40, 0xff +; VI-NEXT: s_lshl_b32 s8, s46, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s6, s6, 0xffff +; VI-NEXT: s_lshl_b32 s7, s7, 16 +; VI-NEXT: v_add_u32_e32 v23, vcc, 48, v0 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v24, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xff +; VI-NEXT: s_lshl_b32 s6, s76, 8 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_and_b32 s6, s45, 0xff +; VI-NEXT: s_lshl_b32 s7, s88, 8 +; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v23, vcc, 52, v0 +; VI-NEXT: 
s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v24, s4 +; VI-NEXT: s_and_b32 s4, s5, 0xff +; VI-NEXT: s_lshl_b32 s5, s42, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s44, 0xff +; VI-NEXT: s_lshl_b32 s6, s43, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_add_u32_e32 v23, vcc, 56, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v23, vcc, 60, v0 +; VI-NEXT: v_mov_b32_e32 v24, s4 +; VI-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v58 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; VI-NEXT: v_or_b32_sdwa v7, v22, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v22, vcc, 64, v0 +; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; VI-NEXT: v_or_b32_sdwa v10, v36, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; VI-NEXT: v_or_b32_sdwa v6, v44, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; VI-NEXT: v_or_b32_sdwa v5, v61, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; VI-NEXT: v_or_b32_sdwa v4, v57, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; VI-NEXT: v_or_b32_sdwa v3, v42, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_readlane_b32 s87, v63, 31 ; VI-NEXT: v_readlane_b32 s86, v63, 30 ; VI-NEXT: v_readlane_b32 s85, v63, 29 @@ -165782,374 +167207,105 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_readlane_b32 s31, v63, 1 ; VI-NEXT: v_readlane_b32 s30, v63, 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v33 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v18, v18, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v33 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v34, v33, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v33, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 4, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:296 ; 
4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 8, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v17, v20, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 12, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: 
s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v18, v21, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 16, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v17, v22, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 20, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v41 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v18, v23, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; 
VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 24, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v17, v24, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 28, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v18, v25, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 32, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 
offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v17, v26, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 36, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v18, v27, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 40, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; VI-NEXT: 
buffer_load_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v17, v28, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 44, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v18, v29, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 48, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: 
v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v17, v30, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 52, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v18, v31, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 56, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v17, v32, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v18, 
v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v18, vcc, 60, v0 -; VI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v17, vcc, 64, v0 -; VI-NEXT: buffer_store_dword v1, v17, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 -; VI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v49 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v23, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v7, v22, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v56 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v7, v22, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x44, v0 +; VI-NEXT: buffer_store_dword v7, v20, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v45 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v7, v20, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x48, v0 +; VI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v43 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v7, v10, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v51 +; VI-NEXT: v_or_b32_sdwa v10, v48, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x4c, v0 +; VI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v55 +; VI-NEXT: v_or_b32_sdwa v7, v26, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x50, v0 +; VI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v53 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v37 +; VI-NEXT: v_or_b32_sdwa v7, v54, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x54, v0 +; VI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v15 +; VI-NEXT: v_or_b32_sdwa v6, v28, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x58, v0 +; VI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v14 +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v60 +; VI-NEXT: v_or_b32_sdwa v5, v25, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v59, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa 
v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x5c, v0 +; VI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v50 +; VI-NEXT: v_or_b32_sdwa v5, v30, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x60, v0 +; VI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v49 +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v46 +; VI-NEXT: v_or_b32_sdwa v4, v27, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v47, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x64, v0 +; VI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v39 +; VI-NEXT: v_or_b32_sdwa v4, v32, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x68, v0 +; VI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v38 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v40 +; VI-NEXT: v_or_b32_sdwa v3, v29, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v41, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x6c, v0 +; VI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v16 +; VI-NEXT: v_or_b32_sdwa v3, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x70, v0 +; VI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v12 +; VI-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x74, v0 +; VI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v18 +; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v61 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v42 -; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v45 -; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v60 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v59 -; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v35 -; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v57 -; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v40 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v44 -; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v53 -; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v51, v2 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v48 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v43 -; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; 
VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 -; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v8 +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -166166,8 +167322,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -216125,74 +217281,74 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v63, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; VI-NEXT: v_cndmask_b32_e32 v15, v62, v63, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v0, 16 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v0, v1, 16 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v0, v61, 16 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v0, v60, 16 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v0, v59, 16 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v0, v58, 16 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v0, v57, 16 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v0, v56, 16 -; VI-NEXT: 
v_lshrrev_b32_e32 v0, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v0, v47, 16 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v0, v46, 16 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v0, v45, 16 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v0, v44, 16 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v0, v43, 16 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v0, v42, 16 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v16, v55, 16 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v30 -; VI-NEXT: v_alignbit_b32 v30, v16, v54, 16 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v29 -; VI-NEXT: v_alignbit_b32 v29, v16, v53, 16 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v28 -; VI-NEXT: v_alignbit_b32 v28, v16, v52, 16 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v27 -; VI-NEXT: v_alignbit_b32 v27, v16, v51, 16 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v26 -; VI-NEXT: v_alignbit_b32 v26, v16, v50, 16 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v25 -; VI-NEXT: v_alignbit_b32 v25, v16, v49, 16 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v24 -; VI-NEXT: v_alignbit_b32 v24, v16, v48, 16 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v23 -; VI-NEXT: v_alignbit_b32 v23, v16, v39, 16 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v22 -; VI-NEXT: v_alignbit_b32 v22, v16, v38, 16 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v21 -; VI-NEXT: v_alignbit_b32 v21, v16, v37, 16 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v20 -; VI-NEXT: v_alignbit_b32 v20, v16, v36, 16 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v19 -; VI-NEXT: v_alignbit_b32 v19, v16, v35, 16 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v18 -; VI-NEXT: v_alignbit_b32 v18, v16, v34, 16 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v32 -; VI-NEXT: v_alignbit_b32 v1, v0, v41, 16 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; VI-NEXT: v_alignbit_b32 v17, v16, v33, 16 +; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: 
v_or_b32_sdwa v15, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v14 +; VI-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 +; VI-NEXT: v_or_b32_sdwa v13, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v12 +; VI-NEXT: v_or_b32_sdwa v12, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; VI-NEXT: v_or_b32_sdwa v11, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 +; VI-NEXT: v_or_b32_sdwa v10, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9 +; VI-NEXT: v_or_b32_sdwa v9, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 +; VI-NEXT: v_or_b32_sdwa v8, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 +; VI-NEXT: v_or_b32_sdwa v7, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; VI-NEXT: v_or_b32_sdwa v6, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; VI-NEXT: v_or_b32_sdwa v5, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; VI-NEXT: v_or_b32_sdwa v4, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 +; VI-NEXT: v_or_b32_sdwa v3, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; VI-NEXT: v_or_b32_sdwa v2, v42, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v31 +; VI-NEXT: v_or_b32_sdwa v31, v55, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v30 +; VI-NEXT: v_or_b32_sdwa v30, v54, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v29 +; VI-NEXT: v_or_b32_sdwa v29, v53, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v28 +; VI-NEXT: v_or_b32_sdwa v28, v52, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v27 +; VI-NEXT: v_or_b32_sdwa v27, v51, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v26 +; VI-NEXT: v_or_b32_sdwa v26, v50, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v25 +; VI-NEXT: v_or_b32_sdwa v25, v49, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v24 +; VI-NEXT: v_or_b32_sdwa v24, v48, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v23 +; VI-NEXT: v_or_b32_sdwa v23, v39, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v22 +; VI-NEXT: v_or_b32_sdwa v22, v38, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v21 +; VI-NEXT: v_or_b32_sdwa v21, v37, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v20 +; VI-NEXT: v_or_b32_sdwa v20, v36, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v19 +; 
VI-NEXT: v_or_b32_sdwa v19, v35, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v18, v34, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v1, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v17 +; VI-NEXT: v_or_b32_sdwa v17, v33, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: v_alignbit_b32 v0, v0, v40, 16 +; VI-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16 +; VI-NEXT: v_or_b32_sdwa v16, v32, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB100_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -219070,19 +220226,19 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; VI-NEXT: v_add_f32_e32 v4, s4, v0 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: s_lshl_b32 s4, s29, 16 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_alignbit_b32 v14, v2, v1, 16 +; VI-NEXT: v_or_b32_sdwa v14, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: 
v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_alignbit_b32 v15, v4, v3, 16 +; VI-NEXT: v_or_b32_sdwa v15, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 @@ -219094,9 +220250,9 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 +; VI-NEXT: v_or_b32_sdwa v13, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 @@ -219112,9 +220268,9 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 +; VI-NEXT: v_or_b32_sdwa v12, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 @@ -219130,9 +220286,9 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 
16, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 +; VI-NEXT: v_or_b32_sdwa v11, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 @@ -219148,9 +220304,9 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 +; VI-NEXT: v_or_b32_sdwa v10, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 @@ -219166,9 +220322,9 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 +; VI-NEXT: v_or_b32_sdwa v9, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 @@ -219184,9 +220340,9 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 +; VI-NEXT: v_or_b32_sdwa v8, v1, v2 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 @@ -219202,9 +220358,9 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 +; VI-NEXT: v_or_b32_sdwa v7, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 @@ -219220,9 +220376,9 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 +; VI-NEXT: v_or_b32_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 @@ -219238,9 +220394,9 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; VI-NEXT: v_or_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, 
vcc, v2, v1 @@ -219256,9 +220412,9 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 @@ -219274,9 +220430,9 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: v_or_b32_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 @@ -219292,9 +220448,9 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; VI-NEXT: v_or_b32_e32 v33, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v18, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1 @@ -219310,9 +220466,9 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 
; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v18, v1, 16 +; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v18, s4, v0 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 @@ -219328,8 +220484,8 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 @@ -219586,38 +220742,38 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; VI-NEXT: v_or_b32_e32 v41, 0x400000, v31 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_cndmask_b32_e32 v31, v40, v41, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, 
v16 -; VI-NEXT: v_alignbit_b32 v31, v31, v55, 16 -; VI-NEXT: v_alignbit_b32 v30, v30, v54, 16 -; VI-NEXT: v_alignbit_b32 v29, v29, v53, 16 -; VI-NEXT: v_alignbit_b32 v28, v28, v52, 16 -; VI-NEXT: v_alignbit_b32 v27, v27, v51, 16 -; VI-NEXT: v_alignbit_b32 v26, v26, v50, 16 -; VI-NEXT: v_alignbit_b32 v25, v25, v49, 16 -; VI-NEXT: v_alignbit_b32 v24, v24, v48, 16 -; VI-NEXT: v_alignbit_b32 v23, v23, v39, 16 -; VI-NEXT: v_alignbit_b32 v22, v22, v38, 16 -; VI-NEXT: v_alignbit_b32 v21, v21, v37, 16 -; VI-NEXT: v_alignbit_b32 v20, v20, v36, 16 -; VI-NEXT: v_alignbit_b32 v19, v19, v35, 16 -; VI-NEXT: v_alignbit_b32 v32, v32, v34, 16 -; VI-NEXT: v_alignbit_b32 v17, v17, v33, 16 -; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; VI-NEXT: v_or_b32_sdwa v31, v55, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v54, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v53, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v52, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v51, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v50, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v49, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v48, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v39, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v38, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v36, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v35, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v32, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v33, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v18, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_branch .LBB101_5 ; VI-NEXT: .LBB101_3: ; VI-NEXT: s_branch .LBB101_2 @@ -221085,22 +222241,141 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; 
SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: 
$vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], 
s32 offset:112 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:120 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 @@ -221119,21 +222394,6 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 -; SI-NEXT: 
buffer_load_dword v62, off, s[0:3], s32 offset:120 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 @@ -221147,21 +222407,37 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 ; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 ; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v56, v1 ; SI-NEXT: s_waitcnt vmcnt(5) @@ -221180,407 +222456,286 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v1 ; SI-NEXT: 
s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; 
SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 
-; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB102_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v15 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; 
SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v23 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v27 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v32 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v36 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v11 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v44 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v15 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v56 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; 
SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v60 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; SI-NEXT: buffer_store_dword v2, off, 
s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v17 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v25 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v28 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v30 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v39 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v51 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 
offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v62 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: 
$vgpr44 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: .LBB102_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB102_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v63 ; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 @@ -221646,35 +222801,30 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v58, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v32 ; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v33 ; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v26 -; 
SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v28 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v58 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -221701,59 +222851,62 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v26 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v58 ; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v8 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_store_dword 
v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v3, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill @@ -221766,10 +222919,10 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: 
buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill @@ -221782,10 +222935,10 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill @@ -221793,23 +222946,25 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; 
SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v3, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill @@ -221822,10 +222977,10 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill @@ -221838,10 +222993,10 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill @@ -221854,10 +223009,10 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill @@ -221870,10 +223025,10 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) 
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill @@ -221886,10 +223041,10 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill @@ -221902,10 +223057,10 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill @@ -221918,10 +223073,10 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill @@ -221933,388 +223088,414 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 -; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 ; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v2, v58 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v3, v24 -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_mov_b32_e32 v4, v7 ; SI-NEXT: .LBB102_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: 
s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, 
s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, 
v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; 
SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 
4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; 
SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 
v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; SI-NEXT: 
s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload @@ -222594,629 +223775,499 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 
offset:60 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 -; SI-NEXT: v_mov_b32_e32 v46, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v2 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:76 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v51, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: 
v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v24, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v29, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v30, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v27, s24 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v38 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v15 +; 
SI-NEXT: v_cvt_f16_f32_e32 v15, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v49 +; SI-NEXT: s_waitcnt vmcnt(13) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v61, v48 -; SI-NEXT: s_waitcnt vmcnt(11) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v36 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v44, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v37, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v48, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v49, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v50, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v51, s29 
+; SI-NEXT: v_cvt_f16_f32_e32 v32, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v52, s29 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: 
buffer_store_dword v30, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB103_2 +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill 
+; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v49, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB103_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v23 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v25 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v26 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v29 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v28 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v30 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v30 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 -; SI-NEXT: 
buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v37 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v35 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v49 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v36 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v39 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v50 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v49 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v51 -; SI-NEXT: 
buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v52 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v40 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v43 -; SI-NEXT: v_mov_b32_e32 v43, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v8 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v20 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 
offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v28 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_mov_b32_e32 v50, v19 -; SI-NEXT: v_mov_b32_e32 v51, v22 -; SI-NEXT: v_mov_b32_e32 v38, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_mov_b32_e32 v37, v45 -; SI-NEXT: v_mov_b32_e32 v27, v26 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v48 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v26 -; SI-NEXT: v_mov_b32_e32 v49, v47 -; SI-NEXT: v_mov_b32_e32 v35, v28 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v62 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v50 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; SI-NEXT: v_mov_b32_e32 v32, v19 +; SI-NEXT: v_mov_b32_e32 v33, v15 +; SI-NEXT: v_mov_b32_e32 v35, v21 +; SI-NEXT: v_mov_b32_e32 v28, v22 +; SI-NEXT: v_mov_b32_e32 
v39, v54 +; SI-NEXT: v_mov_b32_e32 v36, v43 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v43 +; SI-NEXT: v_mov_b32_e32 v43, v7 +; SI-NEXT: v_mov_b32_e32 v49, v44 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v31 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v16 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v18 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_lshlrev_b32_e32 v5, 16, v14 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v15 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v18 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 -; SI-NEXT: v_mov_b32_e32 v57, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v32 -; SI-NEXT: v_mov_b32_e32 v32, v7 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 -; SI-NEXT: v_mov_b32_e32 v33, v12 -; SI-NEXT: v_mov_b32_e32 v34, v5 -; SI-NEXT: v_mov_b32_e32 v58, v7 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v42 
-; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v44 -; SI-NEXT: v_mov_b32_e32 v44, v18 -; SI-NEXT: v_mov_b32_e32 v5, v43 -; SI-NEXT: v_mov_b32_e32 v18, v6 -; SI-NEXT: s_branch .LBB103_3 -; SI-NEXT: .LBB103_2: -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; 
SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: v_mov_b32_e32 v35, v28 -; SI-NEXT: v_mov_b32_e32 v49, v47 -; SI-NEXT: v_mov_b32_e32 v27, v26 -; SI-NEXT: v_mov_b32_e32 v37, v45 -; SI-NEXT: v_mov_b32_e32 v38, v16 -; SI-NEXT: v_mov_b32_e32 v51, v22 -; SI-NEXT: v_mov_b32_e32 v50, v19 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: v_mov_b32_e32 v5, v6 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: 
$vgpr15 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: .LBB103_3: ; %Flow -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v36, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v43, v9 -; SI-NEXT: v_mov_b32_e32 v12, v31 -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v31, v11 -; SI-NEXT: v_mov_b32_e32 v9, v17 -; SI-NEXT: s_cbranch_vccnz .LBB103_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v55 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v54 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v2 +; SI-NEXT: v_mov_b32_e32 v44, v52 +; SI-NEXT: s_cbranch_execnz .LBB103_3 +; SI-NEXT: .LBB103_2: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v62 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v8 -; 
SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v7 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v9 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v33 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v25 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v32 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v48 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v50 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v58 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v7 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: 
v_cvt_f32_f16_e32 v15, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v42 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v9 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v41 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v15 -; SI-NEXT: v_mov_b32_e32 v6, v37 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 -; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v52 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v53 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v47 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v46 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v40 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v39 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v51 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v60 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v50 -; SI-NEXT: v_add_f32_e32 v24, 
0x38000000, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v13 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v18 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 -; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 -; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v49 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v15 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v34 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v20 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 -; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:356 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v34, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v51, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: 
v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v40, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 +; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v47 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v44, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 +; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v57 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v46, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 +; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v47, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v47 +; SI-NEXT: v_add_f32_e32 v59, 0x38000000, v59 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 +; SI-NEXT: v_add_f32_e32 v60, 0x38000000, v60 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v57 +; SI-NEXT: 
v_cvt_f32_f16_e32 v61, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v61 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v52, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v49, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v35, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v1 
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v59, 0x38000000, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v60, 0x38000000, v60 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 -; SI-NEXT: buffer_load_dword v1, 
off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v62 ; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v63, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v63 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -223232,482 +224283,531 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v35 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; 
SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v49 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v19 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v60 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v57 -; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v56 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v46 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v46 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v45 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v43 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; 
SI-NEXT: v_cvt_f16_f32_e32 v4, v42 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v41 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v40 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v51 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte 
Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v31 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v8 +; SI-NEXT: v_mov_b32_e32 v51, v20 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v6 -; SI-NEXT: 
v_lshlrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v16 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v38 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v9 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v34 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v3 +; SI-NEXT: 
v_cvt_f16_f32_e32 v3, v30 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_mov_b32_e32 v16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v15 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v48 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v26 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v14 -; SI-NEXT: v_mov_b32_e32 v16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v37 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v55 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v30 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v54 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt 
vmcnt(3) ; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v6 +; SI-NEXT: v_mov_b32_e32 v6, v19 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v19, v15 +; SI-NEXT: v_mov_b32_e32 v15, v20 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 -; SI-NEXT: v_mov_b32_e32 v4, v27 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v4 +; SI-NEXT: v_mov_b32_e32 v4, v25 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v3 ; SI-NEXT: v_mov_b32_e32 v3, v13 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v2 -; SI-NEXT: .LBB103_5: ; %end -; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v2 +; SI-NEXT: .LBB103_3: ; %end +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; 
SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, 
s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51 ; SI-NEXT: v_lshrrev_b32_e32 v1, 
16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, 
s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v19 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v16 -; SI-NEXT: v_alignbit_b32 
v1, v1, v2, 16 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v8 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v57 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v43 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 ; SI-NEXT: v_lshrrev_b32_e32 
v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v9 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v11 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v14 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v45 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, 
v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v12 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v21 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v24 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v30 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v30 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v26 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22 ; 
SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -223728,6 +224828,116 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB103_4: +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v49, v44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; 
SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: v_mov_b32_e32 v36, v43 +; SI-NEXT: v_mov_b32_e32 v39, v54 +; SI-NEXT: v_mov_b32_e32 v28, v22 +; SI-NEXT: v_mov_b32_e32 v35, v21 +; SI-NEXT: v_mov_b32_e32 v33, v15 +; SI-NEXT: v_mov_b32_e32 v32, v19 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; 
implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_branch .LBB103_2 ; ; VI-LABEL: bitcast_v64f16_to_v64bf16_scalar: ; VI: ; %bb.0: @@ -224058,313 +225268,537 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:32 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword 
v41, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:60 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; 
SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded 
Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v27 -; 
SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v28 +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 +; 
SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v33 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v37 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v44 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v46 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v38 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v49 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v50 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v58 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v59 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v51 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v41 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v42 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v45 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; 
implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; kill: killed $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v47 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v57 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v60 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v62 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v63 -; SI-NEXT: ; kill: killed $vgpr45 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 +; 
SI-NEXT: v_mul_f32_e32 v15, 1.0, v47 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v43 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v45 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v46 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v40 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; kill: killed $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 
-; SI-NEXT: ; kill: killed $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; kill: killed $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v2 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v3 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v1 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v3 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v11 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v14 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:132 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v21 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 
4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: 
$vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v34 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v10 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB104_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v43 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v24 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; kill: killed $vgpr7 +; 
SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v27 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; kill: killed $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v30 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:388 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 
4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte 
Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v57 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v60 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v62 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v31 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v32 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword 
v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; kill: killed $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -224376,834 +225810,677 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: ; kill: killed 
$vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v47 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v46 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v29 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v60 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v57 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v56 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 
16, v59 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v28 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v16 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v3 -; SI-NEXT: ; implicit-def: $vgpr33 -; 
SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:428 ; 4-byte Folded 
Reload -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v40 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v32 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v19 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v13 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v18 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v9 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:436 ; 
4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v31 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v55 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v17 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v54 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v53 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, 
v11 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: .LBB104_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB104_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v33 -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v43 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v33 -; SI-NEXT: v_alignbit_b32 v22, v34, v22, 16 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v32 -; SI-NEXT: v_alignbit_b32 v22, v34, v22, 16 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v31 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v55 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v31 -; SI-NEXT: v_alignbit_b32 v22, v34, v22, 16 -; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v53 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v54 -; SI-NEXT: v_add_f32_e32 v51, 0x40c00000, v34 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v51 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_alignbit_b32 v22, v34, v22, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; 
SI-NEXT: v_alignbit_b32 v15, v22, v15, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; SI-NEXT: v_alignbit_b32 v15, v19, v15, 16 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v3, v3, v10 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 -; SI-NEXT: v_alignbit_b32 v13, v18, v13, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; SI-NEXT: v_and_b32_e32 v9, 
0xffff0000, v9 -; SI-NEXT: v_alignbit_b32 v13, v17, v13, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v9 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v22 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: v_alignbit_b32 v10, v13, v10, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_alignbit_b32 v8, v10, v8, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v39, v10, v11 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_alignbit_b32 v6, 
v8, v6, 16 +; SI-NEXT: v_or_b32_e32 v29, v10, v11 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_or_b32_e32 v30, v10, v11 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v34, v10, v11 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v62 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v20 +; SI-NEXT: v_or_b32_e32 v62, v10, v11 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v21 +; SI-NEXT: v_or_b32_e32 v37, v10, v11 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v59 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v17 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 
0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v19 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_or_b32_e32 v13, v8, v9 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_alignbit_b32 v4, v6, v4, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v60 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; SI-NEXT: v_alignbit_b32 v4, v8, v4, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v27 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: 
v_lshrrev_b32_e32 v10, 16, v8 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_alignbit_b32 v4, v10, v4, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v24 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v61, v1, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v58 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v19, v10, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v56 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v4, v13, v4, 16 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v23 +; SI-NEXT: v_or_b32_e32 v14, v8, v9 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v9 +; SI-NEXT: v_or_b32_e32 v15, v8, v15 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:400 ; 4-byte 
Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v25 +; SI-NEXT: v_alignbit_b32 v46, v15, v27, 16 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v16, v8, v16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v45, v16, v26, 16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v25 +; SI-NEXT: v_alignbit_b32 v44, v14, v31, 16 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v26 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v26 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v20, v10, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; 
SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v27 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v47 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v25 +; SI-NEXT: v_alignbit_b32 v42, v12, v33, 16 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_alignbit_b32 v4, v16, v4, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v21 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v21, v10, v8, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v31 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v44 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v28 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v25 +; SI-NEXT: v_alignbit_b32 v41, v11, v38, 16 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; 
SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v23, v10, v8, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v32 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v30 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v25 +; SI-NEXT: v_alignbit_b32 v55, v10, v48, 16 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v33 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v18, v16, v4, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v13 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v5, v23, v5, 16 -; SI-NEXT: v_alignbit_b32 v2, v21, v2, 16 -; 
SI-NEXT: v_alignbit_b32 v1, v61, v1, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v25 +; SI-NEXT: v_alignbit_b32 v54, v37, v49, 16 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v38 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v24, v10, v8, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v25 +; SI-NEXT: v_alignbit_b32 v53, v62, v50, 16 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v48 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt 
expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v7, v24, v7, 16 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 +; SI-NEXT: v_alignbit_b32 v52, v34, v51, 16 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_alignbit_b32 v25, v45, v8, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v8, v25, v8, 16 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: 
v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v62, v63, v16, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v33 -; SI-NEXT: v_alignbit_b32 v16, v62, v16, 16 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v22, v34, v9, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v9, v22, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v49 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v50 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, 
s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v25 +; SI-NEXT: v_alignbit_b32 v50, v29, v43, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v51 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v51, v30, v40, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_alignbit_b32 v37, v38, v11, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v11, v37, v11, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_alignbit_b32 v39, v48, v12, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v35, v36, v10, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v12, v39, v12, 16 -; SI-NEXT: 
v_alignbit_b32 v10, v35, v10, 16 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v40 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v43 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v25 +; SI-NEXT: v_alignbit_b32 v49, v39, v47, 16 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 
+; SI-NEXT: v_or_b32_e32 v8, v8, v47 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_alignbit_b32 v49, v50, v13, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v51 -; SI-NEXT: v_alignbit_b32 v13, v49, v13, 16 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_and_b32_e32 v56, 0xffff0000, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v51, v52, v14, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v31 -; SI-NEXT: v_alignbit_b32 v14, v51, v14, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v56 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v48, 
v43, v57, 16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_alignbit_b32 v2, v47, v56, 16 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_alignbit_b32 v41, v42, v15, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v32 -; SI-NEXT: v_alignbit_b32 v15, v41, v15, 16 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v57 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v20, v6, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v19, v3, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], 
s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v1, v18, v4, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: .LBB104_4: ; %end -; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_alignbit_b32 v3, v13, v32, 16 +; SI-NEXT: v_alignbit_b32 v38, v40, v58, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v58 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v18 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v17 +; SI-NEXT: v_mov_b32_e32 v17, v39 +; SI-NEXT: v_mov_b32_e32 v39, v62 +; 
SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: .LBB104_4: ; %end +; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v41 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v46 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v0 +; SI-NEXT: 
buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v51 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 
v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v37 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, 
v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v37 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; 
SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload 
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v51 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v3, 
s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v47 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v40 ; SI-NEXT: 
v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -225771,74 +227048,74 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v63, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; VI-NEXT: v_cndmask_b32_e32 v15, v62, v63, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v0, 16 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v0, v1, 16 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v0, v61, 16 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v0, v60, 16 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v0, v59, 16 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v0, v58, 16 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v0, v57, 16 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v0, v56, 16 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v0, v47, 16 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v0, v46, 16 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v0, v45, 16 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v0, v44, 16 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v0, v43, 16 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v0, v42, 16 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v31 -; VI-NEXT: v_alignbit_b32 v31, v16, v55, 16 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v30 -; VI-NEXT: v_alignbit_b32 v30, v16, v54, 16 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v29 -; VI-NEXT: v_alignbit_b32 v29, v16, v53, 16 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v28 -; VI-NEXT: 
v_alignbit_b32 v28, v16, v52, 16 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v27 -; VI-NEXT: v_alignbit_b32 v27, v16, v51, 16 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v26 -; VI-NEXT: v_alignbit_b32 v26, v16, v50, 16 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v25 -; VI-NEXT: v_alignbit_b32 v25, v16, v49, 16 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v24 -; VI-NEXT: v_alignbit_b32 v24, v16, v48, 16 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v23 -; VI-NEXT: v_alignbit_b32 v23, v16, v39, 16 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v22 -; VI-NEXT: v_alignbit_b32 v22, v16, v38, 16 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v21 -; VI-NEXT: v_alignbit_b32 v21, v16, v37, 16 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v20 -; VI-NEXT: v_alignbit_b32 v20, v16, v36, 16 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v19 -; VI-NEXT: v_alignbit_b32 v19, v16, v35, 16 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v18 -; VI-NEXT: v_alignbit_b32 v18, v16, v34, 16 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v32 -; VI-NEXT: v_alignbit_b32 v1, v0, v41, 16 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 -; VI-NEXT: v_alignbit_b32 v17, v16, v33, 16 +; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_or_b32_sdwa v15, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v14 +; VI-NEXT: v_or_b32_sdwa v14, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 +; VI-NEXT: v_or_b32_sdwa v13, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v12 +; VI-NEXT: v_or_b32_sdwa v12, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; VI-NEXT: v_or_b32_sdwa v11, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 +; VI-NEXT: v_or_b32_sdwa v10, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9 +; VI-NEXT: v_or_b32_sdwa v9, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 +; VI-NEXT: v_or_b32_sdwa v8, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 +; VI-NEXT: v_or_b32_sdwa v7, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; VI-NEXT: v_or_b32_sdwa v6, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; VI-NEXT: v_or_b32_sdwa v5, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; VI-NEXT: v_or_b32_sdwa v4, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 +; VI-NEXT: v_or_b32_sdwa v3, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; VI-NEXT: v_or_b32_sdwa v2, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v31 +; VI-NEXT: v_or_b32_sdwa v31, v55, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v30 +; VI-NEXT: v_or_b32_sdwa v30, v54, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v29 +; VI-NEXT: v_or_b32_sdwa v29, v53, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v28 +; VI-NEXT: v_or_b32_sdwa v28, v52, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v27 +; VI-NEXT: v_or_b32_sdwa v27, v51, v16 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v26 +; VI-NEXT: v_or_b32_sdwa v26, v50, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v25 +; VI-NEXT: v_or_b32_sdwa v25, v49, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v24 +; VI-NEXT: v_or_b32_sdwa v24, v48, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v23 +; VI-NEXT: v_or_b32_sdwa v23, v39, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v22 +; VI-NEXT: v_or_b32_sdwa v22, v38, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v21 +; VI-NEXT: v_or_b32_sdwa v21, v37, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v20 +; VI-NEXT: v_or_b32_sdwa v20, v36, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v19 +; VI-NEXT: v_or_b32_sdwa v19, v35, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v18 +; VI-NEXT: v_or_b32_sdwa v18, v34, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v32 +; VI-NEXT: v_or_b32_sdwa v1, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v17 +; VI-NEXT: v_or_b32_sdwa v17, v33, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: v_alignbit_b32 v0, v0, v40, 16 +; VI-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16 +; VI-NEXT: v_or_b32_sdwa v16, v32, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB104_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -227562,284 +228839,308 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v12 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v5 -; SI-NEXT: 
v_mul_f32_e32 v5, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v30 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s23 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 
v24, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v30 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s24 ; SI-NEXT: v_mul_f32_e64 v29, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s28 -; SI-NEXT: v_mul_f32_e64 v24, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v32 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v33 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v33 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v36 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v34 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v35 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v38 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v39 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v48 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v36 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v38 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v50 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v50 -; SI-NEXT: s_waitcnt vmcnt(10) expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; 
SI-NEXT: v_mul_f32_e32 v36, 1.0, v51 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v52 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v53 -; SI-NEXT: s_waitcnt vmcnt(6) expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v43 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v55 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v41 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v42 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s17 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v51 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v55 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s26 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v54 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v40 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v41 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v42 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v43 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v37, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v1, 
off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 
offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:348 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v47, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB105_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v57 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded 
Reload +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v35 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v29 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v36 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v36, v59 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v53 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v59 +; SI-NEXT: v_mov_b32_e32 v42, v50 +; SI-NEXT: v_mov_b32_e32 v50, v11 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v14 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v34 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 
4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v48 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v24 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v38, v60 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v60 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v5 -; SI-NEXT: v_mov_b32_e32 v42, v62 -; SI-NEXT: v_mov_b32_e32 v43, v63 -; SI-NEXT: v_mov_b32_e32 v55, v12 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v25 -; SI-NEXT: v_mov_b32_e32 v25, v60 -; SI-NEXT: v_mov_b32_e32 v54, v47 -; SI-NEXT: v_mov_b32_e32 v40, v20 -; SI-NEXT: v_mov_b32_e32 v51, v61 +; SI-NEXT: v_mov_b32_e32 v28, v31 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v31 +; SI-NEXT: v_mov_b32_e32 v41, v24 +; SI-NEXT: v_mov_b32_e32 v43, v25 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_mov_b32_e32 v52, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v46 -; SI-NEXT: v_mov_b32_e32 v29, v31 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v31 -; SI-NEXT: v_mov_b32_e32 v24, v56 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_mov_b32_e32 v52, v10 -; SI-NEXT: v_mov_b32_e32 
v53, v59 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v49 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v61 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mov_b32_e32 v53, v42 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v57 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v40 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v20 +; SI-NEXT: v_mov_b32_e32 v20, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v63 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v50, v1 -; SI-NEXT: v_lshrrev_b32_e32 
v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v35, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_mov_b32_e32 v62, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v63 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v46 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v31, v3 +; SI-NEXT: v_mov_b32_e32 v3, v54 +; SI-NEXT: v_mov_b32_e32 v54, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v21 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v10 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v27 -; 
SI-NEXT: v_lshrrev_b32_e32 v27, 16, v59 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v32 -; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v49 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v39 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mov_b32_e32 v41, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v47 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v33 -; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v19 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v34 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v56 +; SI-NEXT: v_mov_b32_e32 v56, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mov_b32_e32 v48, v11 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 
16, v45 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 -; SI-NEXT: v_mov_b32_e32 v39, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v44 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v24 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v22 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v25 +; SI-NEXT: v_mov_b32_e32 v25, v29 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v47 +; SI-NEXT: v_mov_b32_e32 v47, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v42 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v49 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v45 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v58 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v48 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v37, v38 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_mov_b32_e32 v11, v45 +; SI-NEXT: v_mov_b32_e32 v45, v46 +; 
SI-NEXT: v_mov_b32_e32 v46, v62 +; SI-NEXT: v_mov_b32_e32 v62, v58 +; SI-NEXT: v_mov_b32_e32 v58, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 +; SI-NEXT: v_mov_b32_e32 v7, v24 +; SI-NEXT: v_mov_b32_e32 v24, v29 +; SI-NEXT: v_mov_b32_e32 v29, v15 +; SI-NEXT: v_mov_b32_e32 v15, v44 +; SI-NEXT: v_mov_b32_e32 v44, v63 ; SI-NEXT: s_branch .LBB105_3 ; SI-NEXT: .LBB105_2: -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v55, v12 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: v_mov_b32_e32 v53, v50 +; SI-NEXT: v_mov_b32_e32 v50, v11 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -227871,674 +229172,716 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: ; 
implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v5 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: v_mov_b32_e32 v51, v61 -; SI-NEXT: v_mov_b32_e32 v42, v62 -; SI-NEXT: v_mov_b32_e32 v29, v31 -; SI-NEXT: v_mov_b32_e32 v25, v60 -; SI-NEXT: v_mov_b32_e32 v24, v56 -; SI-NEXT: v_mov_b32_e32 v54, v47 -; SI-NEXT: v_mov_b32_e32 v40, v20 -; SI-NEXT: v_mov_b32_e32 v43, v63 -; SI-NEXT: v_mov_b32_e32 v52, v10 -; SI-NEXT: v_mov_b32_e32 v53, v59 -; SI-NEXT: v_mov_b32_e32 v39, v4 -; SI-NEXT: v_mov_b32_e32 v37, v38 +; SI-NEXT: v_mov_b32_e32 v43, v25 +; SI-NEXT: v_mov_b32_e32 v41, v24 +; SI-NEXT: v_mov_b32_e32 v28, v31 +; SI-NEXT: v_mov_b32_e32 v54, v15 +; SI-NEXT: v_mov_b32_e32 v52, v13 +; SI-NEXT: v_mov_b32_e32 v38, v60 +; SI-NEXT: v_mov_b32_e32 v36, v59 +; SI-NEXT: v_mov_b32_e32 v20, v3 ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: 
$vgpr25 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 
+; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: .LBB105_3: ; %Flow -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v42, v53 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB105_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v55 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v39 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v37 -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v32 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v34 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v33 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v17, v43 +; SI-NEXT: v_mov_b32_e32 v16, v41 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v53, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v53 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded 
Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v40, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v40 +; SI-NEXT: v_or_b32_e32 v1, v3, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_alignbit_b32 v1, v9, v1, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v50 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: v_alignbit_b32 v1, v11, v1, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v42 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 -; SI-NEXT: v_alignbit_b32 v1, v14, v1, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v40 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v42, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v42 +; SI-NEXT: v_or_b32_e32 v1, v3, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 -; SI-NEXT: v_alignbit_b32 v51, v16, v1, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v54 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 -; SI-NEXT: v_alignbit_b32 v1, v18, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v21 +; SI-NEXT: v_or_b32_e32 v1, v3, v6 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; 
SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v29 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v31 -; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v9 -; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v3 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v18, v20, v1, 16 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v7 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v43 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; SI-NEXT: v_alignbit_b32 v1, v22, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v1, v23, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v1, v26, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v32 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v29 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v33 -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v43 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v1, v27, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_and_b32_e32 v3, 0xffff0000, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v25 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v24 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v1, v28, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v41, v3, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; SI-NEXT: v_or_b32_e32 v22, v3, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v28 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; SI-NEXT: v_or_b32_e32 v23, v3, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v7, 
0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v12 +; SI-NEXT: v_or_b32_e32 v45, v3, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v13 +; SI-NEXT: v_or_b32_e32 v28, v3, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v14 +; SI-NEXT: v_or_b32_e32 v25, v3, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v15 +; SI-NEXT: v_or_b32_e32 v47, v3, v7 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v13 +; SI-NEXT: v_mov_b32_e32 v13, v45 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; SI-NEXT: v_mov_b32_e32 v60, v23 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v27 +; SI-NEXT: 
v_or_b32_e32 v56, v3, v7 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v29 +; SI-NEXT: v_or_b32_e32 v4, v3, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v30 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v31 +; SI-NEXT: v_or_b32_e32 v26, v11, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v52, v30, v1, 16 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: 
v_lshrrev_b32_e32 v30, 16, v29 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v36, v35, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v32 +; SI-NEXT: v_or_b32_e32 v1, v16, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v32 -; SI-NEXT: v_alignbit_b32 v48, v49, v1, 16 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v25 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v20 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 
-; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v28, v59, v1, 16 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v26, v28, v26, 16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_alignbit_b32 v46, v61, v31, 16 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v21, v30, v1, 16 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v63, 0xffff0000, v32 +; SI-NEXT: v_or_b32_e32 v1, v16, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v31 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v23, v10, v1, 16 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v63, v23, v27, 16 -; SI-NEXT: v_alignbit_b32 v27, v21, v12, 16 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v33, 
0x40c00000, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v33 +; SI-NEXT: v_or_b32_e32 v2, v32, v1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v57, v58, v1, 16 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v35 +; SI-NEXT: v_or_b32_e32 v3, v33, v2 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v17, v1, v20, 16 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_alignbit_b32 v19, v17, v19, 16 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; 
SI-NEXT: v_lshrrev_b32_e32 v35, 16, v35 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v24 -; SI-NEXT: v_alignbit_b32 v56, v47, v20, 16 -; SI-NEXT: v_alignbit_b32 v20, v62, v11, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v16, v56, v16, 16 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v36 +; SI-NEXT: v_or_b32_e32 v5, v35, v4 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v57, v47, v4, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v37 +; SI-NEXT: v_or_b32_e32 v6, v36, v5 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v59, v25, v5, 16 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v6 +; SI-NEXT: 
buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; SI-NEXT: v_or_b32_e32 v11, v37, v6 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v37, v7 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v22, v45, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v29 -; SI-NEXT: v_alignbit_b32 v13, v60, v25, 16 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v38, v17 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: 
v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 -; SI-NEXT: v_alignbit_b32 v24, v44, v3, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v39 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v39, v16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v48 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v42 +; SI-NEXT: v_alignbit_b32 v9, v23, v16, 16 +; SI-NEXT: v_mov_b32_e32 v23, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v49, 0xffff0000, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v48, v18 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v49 +; SI-NEXT: 
buffer_store_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v9, v11, v9, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v50 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v11, v49, v32 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v15 -; SI-NEXT: v_mov_b32_e32 v15, v24 +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v50 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v51 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v11, v50, v33 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v51, 0xffff0000, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v19 +; 
SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v3, v39, 16 -; SI-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v11, v51, v19 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v52 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 +; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v54 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v4, v9, v5, 16 -; SI-NEXT: v_alignbit_b32 v5, v36, v7, 16 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v11, v52, v35 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v54 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v36 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v4, v2, v6, 16 -; SI-NEXT: v_alignbit_b32 v6, v46, v33, 16 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 
offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v11, v54, v20 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v55 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v44, 0x40c00000, v44 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v44 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v4, v24, v38, 16 -; SI-NEXT: v_alignbit_b32 v38, v48, v8, 16 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v4, v22, v37, 16 +; SI-NEXT: v_or_b32_e32 v11, v55, v36 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v57, v32, 16 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v4, v20, v34, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v20, v52 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v30 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v4, v13, v14, 16 -; SI-NEXT: v_mov_b32_e32 v14, v51 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: .LBB105_5: ; %end +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v29 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v14 +; SI-NEXT: v_mov_b32_e32 v31, v28 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v53 +; SI-NEXT: v_alignbit_b32 v53, v26, v34, 16 +; SI-NEXT: v_alignbit_b32 v34, v3, v1, 16 +; SI-NEXT: v_alignbit_b32 v12, v31, v6, 16 +; SI-NEXT: v_alignbit_b32 v8, v22, v18, 16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v6, v41, v32, 16 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v10 +; SI-NEXT: v_alignbit_b32 v14, v7, v63, 16 +; SI-NEXT: v_alignbit_b32 v30, v56, v2, 16 +; SI-NEXT: v_alignbit_b32 v10, v13, v17, 16 +; SI-NEXT: v_mov_b32_e32 v22, v41 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_alignbit_b32 v51, v18, v36, 16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_alignbit_b32 v2, v32, v35, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v21, v1 +; SI-NEXT: v_alignbit_b32 v5, v1, v33, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v4, v1, v19, 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte 
Folded Reload +; SI-NEXT: v_mov_b32_e32 v33, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; SI-NEXT: v_alignbit_b32 v1, v19, v20, 16 +; SI-NEXT: .LBB105_5: ; %end +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v53 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_or_b32_e32 v4, v7, v4 -; SI-NEXT: v_add_i32_e32 v7, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v7 +; SI-NEXT: v_or_b32_e32 v11, v11, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v11, v16, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 8, v0 +; 
SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v3 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v7, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 
offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v58 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v57 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 32, v0 +; SI-NEXT: 
buffer_store_dword v3, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v62 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v59 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, 
vcc, 44, v0 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v46 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v46 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v14 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60 -; SI-NEXT: v_or_b32_e32 
v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v45 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v56 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v44 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x48, v0 +; SI-NEXT: 
buffer_store_dword v3, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_or_b32_e32 v3, v3, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v29 +; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, 
s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v24 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; 
SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v39 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -228624,19 +229967,19 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_add_f32_e32 v4, s4, v0 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: s_lshl_b32 s4, s29, 16 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_alignbit_b32 v14, v2, v1, 16 +; VI-NEXT: v_or_b32_sdwa v14, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_alignbit_b32 v15, v4, v3, 16 +; VI-NEXT: v_or_b32_sdwa v15, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 @@ -228648,9 +229991,9 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: 
s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 +; VI-NEXT: v_or_b32_sdwa v13, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 @@ -228666,9 +230009,9 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 +; VI-NEXT: v_or_b32_sdwa v12, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 @@ -228684,9 +230027,9 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 +; VI-NEXT: v_or_b32_sdwa v11, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 @@ -228702,9 +230045,9 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 +; VI-NEXT: v_or_b32_sdwa v10, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 @@ -228720,9 +230063,9 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 +; VI-NEXT: v_or_b32_sdwa v9, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 @@ -228738,9 +230081,9 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 +; VI-NEXT: v_or_b32_sdwa v8, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 @@ -228756,9 +230099,9 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 +; VI-NEXT: v_or_b32_sdwa v7, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 @@ -228774,9 +230117,9 @@ define inreg <64 
x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 +; VI-NEXT: v_or_b32_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 @@ -228792,9 +230135,9 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; VI-NEXT: v_or_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 @@ -228810,9 +230153,9 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 @@ -228828,9 +230171,9 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: 
v_cndmask_b32_e32 v2, v3, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: v_or_b32_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 @@ -228846,9 +230189,9 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v33, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v18, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1 @@ -228864,9 +230207,9 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 ; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v18, v1, 16 +; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v18, s4, v0 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 ; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 @@ -228882,8 +230225,8 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v18, 16 +; VI-NEXT: 
v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v16 ; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 @@ -229140,38 +230483,38 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v41, 0x400000, v31 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; VI-NEXT: v_cndmask_b32_e32 v31, v40, v41, vcc -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v31, v31, v55, 16 -; VI-NEXT: v_alignbit_b32 v30, v30, v54, 16 -; VI-NEXT: v_alignbit_b32 v29, v29, v53, 16 -; VI-NEXT: v_alignbit_b32 v28, v28, v52, 16 -; VI-NEXT: v_alignbit_b32 v27, v27, v51, 16 -; VI-NEXT: v_alignbit_b32 v26, v26, v50, 16 -; VI-NEXT: v_alignbit_b32 v25, v25, v49, 16 -; VI-NEXT: v_alignbit_b32 v24, v24, v48, 16 -; VI-NEXT: v_alignbit_b32 v23, v23, v39, 16 -; VI-NEXT: v_alignbit_b32 v22, v22, v38, 16 -; VI-NEXT: v_alignbit_b32 v21, v21, v37, 16 -; VI-NEXT: v_alignbit_b32 v20, v20, v36, 16 -; VI-NEXT: v_alignbit_b32 v19, v19, v35, 16 -; VI-NEXT: v_alignbit_b32 v32, v32, v34, 16 -; VI-NEXT: v_alignbit_b32 v17, v17, v33, 16 -; VI-NEXT: v_alignbit_b32 v16, v16, v18, 16 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; VI-NEXT: v_and_b32_e32 v30, 
0xffff0000, v30 +; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; VI-NEXT: v_or_b32_sdwa v31, v55, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v54, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v53, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v52, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v51, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v50, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v49, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v48, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v39, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v38, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v36, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v35, v19 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v32, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v33, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v18, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_branch .LBB105_5 ; VI-NEXT: .LBB105_3: ; VI-NEXT: s_branch .LBB105_2 @@ -230643,8 +231986,8 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr21 @@ -230706,10 +232049,10 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v3 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v3 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v3 @@ -231057,7 +232400,7 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v9, v20, v9 ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v6 ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -231066,7 +232409,7 @@ define <64 x bfloat> 
@bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v10 @@ -231078,262 +232421,294 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; SI-NEXT: .LBB106_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v27 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v8 ; 
SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v23 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v21 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v12 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v19 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v14 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v17 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v16 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v11 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: 
v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v18 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v20 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v9 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v13 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v24 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v2, 
1.0, v26 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v5 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v28 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 +; 
SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v32 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v33 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 
4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v35 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v35 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v36 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; 
SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v37 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v39 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_mul_f32_e32 v1, 1.0, v48 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v49 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v50 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v50 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 
4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v51 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v52 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v53 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; 
SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v55 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload @@ -231637,54 +233012,54 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: v_writelane_b32 v40, s85, 29 ; SI-NEXT: v_writelane_b32 v40, s86, 30 ; SI-NEXT: v_writelane_b32 v40, s87, 31 +; SI-NEXT: s_mov_b32 s74, s23 +; SI-NEXT: s_mov_b32 s72, s21 +; SI-NEXT: s_mov_b32 s61, s18 ; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane ; SI-NEXT: s_mov_b32 s60, s16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v41, s17, 0 -; SI-NEXT: s_mov_b32 s61, s19 ; SI-NEXT: v_writelane_b32 v41, s60, 1 -; SI-NEXT: s_mov_b32 s63, s18 -; SI-NEXT: v_writelane_b32 v41, s61, 2 -; SI-NEXT: s_mov_b32 s72, s21 -; SI-NEXT: v_writelane_b32 v41, s63, 3 +; SI-NEXT: v_writelane_b32 v41, s19, 2 +; SI-NEXT: v_writelane_b32 v41, s61, 3 ; SI-NEXT: 
v_writelane_b32 v41, s72, 4 -; SI-NEXT: s_mov_b32 s74, s23 ; SI-NEXT: v_writelane_b32 v41, s20, 5 ; SI-NEXT: v_writelane_b32 v41, s74, 6 -; SI-NEXT: s_mov_b32 s75, s25 +; SI-NEXT: s_mov_b32 s76, s25 ; SI-NEXT: v_writelane_b32 v41, s22, 7 -; SI-NEXT: v_writelane_b32 v41, s75, 8 -; SI-NEXT: s_mov_b32 s76, s27 +; SI-NEXT: v_writelane_b32 v41, s76, 8 +; SI-NEXT: s_mov_b32 s78, s27 ; SI-NEXT: v_writelane_b32 v41, s24, 9 -; SI-NEXT: v_writelane_b32 v41, s76, 10 -; SI-NEXT: s_mov_b32 s93, s29 +; SI-NEXT: v_writelane_b32 v41, s78, 10 +; SI-NEXT: s_mov_b32 s88, s29 ; SI-NEXT: v_writelane_b32 v41, s26, 11 -; SI-NEXT: v_writelane_b32 v41, s93, 12 -; SI-NEXT: v_readfirstlane_b32 s16, v2 +; SI-NEXT: v_writelane_b32 v41, s88, 12 +; SI-NEXT: v_readfirstlane_b32 s77, v2 ; SI-NEXT: v_writelane_b32 v41, s28, 13 -; SI-NEXT: v_readfirstlane_b32 s73, v4 -; SI-NEXT: v_writelane_b32 v41, s16, 14 -; SI-NEXT: v_readfirstlane_b32 s89, v3 -; SI-NEXT: v_writelane_b32 v41, s73, 15 -; SI-NEXT: v_readfirstlane_b32 s90, v6 -; SI-NEXT: v_writelane_b32 v41, s89, 16 -; SI-NEXT: v_readfirstlane_b32 s91, v5 -; SI-NEXT: v_writelane_b32 v41, s90, 17 -; SI-NEXT: v_readfirstlane_b32 s34, v8 -; SI-NEXT: v_writelane_b32 v41, s91, 18 -; SI-NEXT: v_readfirstlane_b32 s35, v7 -; SI-NEXT: v_writelane_b32 v41, s34, 19 -; SI-NEXT: v_readfirstlane_b32 s36, v10 -; SI-NEXT: v_writelane_b32 v41, s35, 20 -; SI-NEXT: v_writelane_b32 v40, s96, 32 -; SI-NEXT: v_readfirstlane_b32 s37, v9 -; SI-NEXT: v_writelane_b32 v41, s36, 21 +; SI-NEXT: v_readfirstlane_b32 s79, v4 +; SI-NEXT: v_writelane_b32 v41, s77, 14 +; SI-NEXT: v_readfirstlane_b32 s90, v3 +; SI-NEXT: v_writelane_b32 v41, s79, 15 +; SI-NEXT: v_readfirstlane_b32 s91, v6 +; SI-NEXT: v_writelane_b32 v41, s90, 16 +; SI-NEXT: v_readfirstlane_b32 s92, v5 +; SI-NEXT: v_writelane_b32 v41, s91, 17 +; SI-NEXT: v_readfirstlane_b32 s93, v8 +; SI-NEXT: v_writelane_b32 v41, s92, 18 +; SI-NEXT: v_readfirstlane_b32 s94, v7 +; SI-NEXT: v_writelane_b32 v41, s93, 19 +; SI-NEXT: 
v_readfirstlane_b32 s95, v10 +; SI-NEXT: v_writelane_b32 v41, s94, 20 +; SI-NEXT: v_readfirstlane_b32 s30, v9 +; SI-NEXT: v_writelane_b32 v41, s95, 21 +; SI-NEXT: v_readfirstlane_b32 s31, v12 +; SI-NEXT: v_writelane_b32 v41, s30, 22 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s62, v31 +; SI-NEXT: v_readfirstlane_b32 s21, v31 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s80, v32 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s69, v33 +; SI-NEXT: v_readfirstlane_b32 s75, v33 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 @@ -231696,20 +233071,25 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s84, v34 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s68, v35 +; SI-NEXT: v_readfirstlane_b32 s23, v35 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s83, v36 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s87, v38 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 -; SI-NEXT: v_readfirstlane_b32 s6, v37 +; SI-NEXT: v_readfirstlane_b32 s18, v37 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 +; SI-NEXT: v_writelane_b32 v41, s31, 23 +; SI-NEXT: v_readfirstlane_b32 s34, v11 +; SI-NEXT: v_readfirstlane_b32 s35, v14 +; SI-NEXT: v_readfirstlane_b32 s36, v13 +; SI-NEXT: v_writelane_b32 v40, s96, 32 +; SI-NEXT: v_readfirstlane_b32 s37, v16 ; SI-NEXT: v_writelane_b32 v40, s97, 33 -; SI-NEXT: v_readfirstlane_b32 s38, v12 -; SI-NEXT: v_writelane_b32 v41, s37, 22 +; SI-NEXT: v_readfirstlane_b32 s38, v15 ; SI-NEXT: v_writelane_b32 v40, s98, 34 ; SI-NEXT: v_readfirstlane_b32 
s14, v30 ; SI-NEXT: v_readfirstlane_b32 s15, v29 @@ -231719,21 +233099,13 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: v_readfirstlane_b32 s11, v25 ; SI-NEXT: v_readfirstlane_b32 s8, v24 ; SI-NEXT: v_readfirstlane_b32 s9, v23 -; SI-NEXT: v_readfirstlane_b32 s88, v22 -; SI-NEXT: v_readfirstlane_b32 s29, v21 -; SI-NEXT: v_readfirstlane_b32 s79, v20 -; SI-NEXT: v_readfirstlane_b32 s27, v19 -; SI-NEXT: v_readfirstlane_b32 s78, v18 -; SI-NEXT: v_readfirstlane_b32 s25, v17 -; SI-NEXT: v_readfirstlane_b32 s77, v16 -; SI-NEXT: v_readfirstlane_b32 s23, v15 -; SI-NEXT: v_readfirstlane_b32 s39, v14 -; SI-NEXT: v_readfirstlane_b32 s21, v13 -; SI-NEXT: v_readfirstlane_b32 s19, v11 -; SI-NEXT: v_readfirstlane_b32 s18, v1 -; SI-NEXT: v_writelane_b32 v41, s38, 23 +; SI-NEXT: v_readfirstlane_b32 s89, v22 +; SI-NEXT: v_readfirstlane_b32 s7, v21 +; SI-NEXT: v_readfirstlane_b32 s25, v20 +; SI-NEXT: v_readfirstlane_b32 s29, v19 +; SI-NEXT: v_readfirstlane_b32 s39, v18 +; SI-NEXT: v_readfirstlane_b32 s27, v17 ; SI-NEXT: v_writelane_b32 v40, s99, 35 -; SI-NEXT: v_writelane_b32 v41, s39, 24 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s58, v31 ; SI-NEXT: s_waitcnt vmcnt(11) @@ -231753,261 +233125,284 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_readfirstlane_b32 s42, v34 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v38 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v1 +; SI-NEXT: v_writelane_b32 v41, s5, 24 +; SI-NEXT: v_writelane_b32 v41, s34, 25 +; SI-NEXT: v_writelane_b32 v41, s35, 26 +; SI-NEXT: v_writelane_b32 v41, s36, 27 +; SI-NEXT: v_writelane_b32 v41, s37, 28 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_readfirstlane_b32 s43, v35 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_readfirstlane_b32 s40, v36 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s41, v37 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; 
SI-NEXT: v_writelane_b32 v41, s38, 29 +; SI-NEXT: v_writelane_b32 v41, s39, 30 ; SI-NEXT: s_cbranch_scc0 .LBB107_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshl_b32 s4, s60, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 25 -; SI-NEXT: s_lshl_b32 s4, s63, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 26 -; SI-NEXT: s_lshl_b32 s4, s20, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 27 -; SI-NEXT: s_lshl_b32 s4, s22, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 28 -; SI-NEXT: s_lshl_b32 s4, s24, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 29 -; SI-NEXT: s_lshl_b32 s4, s26, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 30 -; SI-NEXT: s_lshl_b32 s4, s28, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 31 -; SI-NEXT: s_lshl_b32 s4, s18, 16 ; SI-NEXT: v_writelane_b32 v41, s4, 32 -; SI-NEXT: s_lshl_b32 s4, s89, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 33 -; SI-NEXT: s_lshl_b32 s4, s91, 16 +; SI-NEXT: s_lshl_b32 s4, s17, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 31 +; SI-NEXT: s_lshl_b32 s4, s61, 16 ; SI-NEXT: v_writelane_b32 v41, s4, 34 -; SI-NEXT: s_lshl_b32 s4, s35, 16 -; SI-NEXT: v_writelane_b32 v41, s4, 35 -; SI-NEXT: s_lshl_b32 s4, s37, 16 -; SI-NEXT: s_lshl_b32 s7, s17, 16 -; SI-NEXT: s_lshl_b32 s96, s61, 16 -; SI-NEXT: s_lshl_b32 s99, s72, 16 -; SI-NEXT: s_lshl_b32 s97, s74, 16 -; SI-NEXT: s_lshl_b32 s92, s75, 16 -; SI-NEXT: s_lshl_b32 s94, s76, 16 -; SI-NEXT: s_lshl_b32 s95, s93, 16 -; SI-NEXT: s_lshl_b32 s93, s16, 16 -; SI-NEXT: s_lshl_b32 s30, s73, 16 -; SI-NEXT: s_lshl_b32 s31, s90, 16 -; SI-NEXT: s_lshl_b32 s34, s34, 16 +; SI-NEXT: s_lshl_b32 s4, s19, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 33 +; SI-NEXT: s_lshl_b32 s4, s20, 16 ; SI-NEXT: v_writelane_b32 v41, s4, 36 -; SI-NEXT: s_lshl_b32 s35, s36, 16 -; SI-NEXT: s_lshl_b32 s86, s19, 16 -; SI-NEXT: s_lshl_b32 s36, s38, 16 -; SI-NEXT: s_lshl_b32 s22, s21, 16 -; SI-NEXT: s_lshl_b32 s37, s39, 16 -; SI-NEXT: s_lshl_b32 s24, s23, 16 -; SI-NEXT: s_lshl_b32 s38, s77, 16 -; SI-NEXT: s_lshl_b32 s28, s25, 16 -; SI-NEXT: s_lshl_b32 s39, s78, 16 -; SI-NEXT: 
s_lshl_b32 s61, s27, 16 -; SI-NEXT: s_lshl_b32 s48, s79, 16 -; SI-NEXT: s_lshl_b32 s89, s29, 16 -; SI-NEXT: s_lshl_b32 s49, s88, 16 -; SI-NEXT: s_lshl_b32 s60, s9, 16 -; SI-NEXT: s_lshl_b32 s50, s8, 16 -; SI-NEXT: s_lshl_b32 s90, s11, 16 -; SI-NEXT: s_lshl_b32 s91, s10, 16 -; SI-NEXT: s_lshl_b32 s70, s13, 16 -; SI-NEXT: s_lshl_b32 s51, s12, 16 -; SI-NEXT: s_lshl_b32 s71, s15, 16 -; SI-NEXT: s_lshl_b32 s52, s14, 16 -; SI-NEXT: s_lshl_b32 s20, s41, 16 -; SI-NEXT: s_lshl_b32 s53, s40, 16 -; SI-NEXT: s_lshl_b32 s81, s43, 16 -; SI-NEXT: s_lshl_b32 s54, s42, 16 -; SI-NEXT: s_lshl_b32 s63, s45, 16 -; SI-NEXT: s_lshl_b32 s55, s44, 16 -; SI-NEXT: s_lshl_b32 s72, s47, 16 -; SI-NEXT: s_lshl_b32 s64, s46, 16 -; SI-NEXT: s_lshl_b32 s82, s57, 16 -; SI-NEXT: s_lshl_b32 s65, s56, 16 -; SI-NEXT: s_lshl_b32 s74, s59, 16 -; SI-NEXT: s_lshl_b32 s66, s58, 16 -; SI-NEXT: s_lshl_b32 s75, s87, 16 -; SI-NEXT: s_mov_b32 s73, s6 -; SI-NEXT: s_lshl_b32 s67, s6, 16 -; SI-NEXT: s_lshl_b32 s76, s83, 16 -; SI-NEXT: s_mov_b32 s16, s68 -; SI-NEXT: s_lshl_b32 s68, s68, 16 -; SI-NEXT: s_lshl_b32 s85, s84, 16 -; SI-NEXT: s_mov_b32 s98, s69 -; SI-NEXT: s_lshl_b32 s69, s69, 16 -; SI-NEXT: s_lshl_b32 s17, s80, 16 -; SI-NEXT: s_mov_b32 s6, s62 -; SI-NEXT: s_lshl_b32 s26, s62, 16 +; SI-NEXT: s_lshl_b32 s4, s72, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 35 +; SI-NEXT: s_lshl_b32 s4, s74, 16 +; SI-NEXT: s_lshl_b32 s16, s22, 16 +; SI-NEXT: v_writelane_b32 v41, s4, 37 +; SI-NEXT: s_lshl_b32 s6, s24, 16 +; SI-NEXT: s_lshl_b32 s73, s76, 16 +; SI-NEXT: s_lshl_b32 s98, s26, 16 +; SI-NEXT: s_lshl_b32 s63, s78, 16 +; SI-NEXT: s_lshl_b32 s97, s28, 16 +; SI-NEXT: s_lshl_b32 s62, s88, 16 +; SI-NEXT: s_lshl_b32 s85, s5, 16 +; SI-NEXT: s_lshl_b32 s96, s77, 16 +; SI-NEXT: s_lshl_b32 s81, s90, 16 +; SI-NEXT: s_lshl_b32 s99, s79, 16 +; SI-NEXT: s_lshl_b32 s70, s92, 16 +; SI-NEXT: s_lshl_b32 s86, s91, 16 +; SI-NEXT: s_lshl_b32 s68, s94, 16 +; SI-NEXT: s_lshl_b32 s82, s93, 16 +; SI-NEXT: s_lshl_b32 s66, s30, 16 +; SI-NEXT: 
s_lshl_b32 s71, s95, 16 +; SI-NEXT: s_lshl_b32 s64, s34, 16 +; SI-NEXT: s_lshl_b32 s69, s31, 16 +; SI-NEXT: s_lshl_b32 s54, s36, 16 +; SI-NEXT: s_lshl_b32 s67, s35, 16 +; SI-NEXT: s_lshl_b32 s52, s38, 16 +; SI-NEXT: s_lshl_b32 s65, s37, 16 +; SI-NEXT: s_lshl_b32 s50, s27, 16 +; SI-NEXT: s_lshl_b32 s55, s39, 16 +; SI-NEXT: s_lshl_b32 s49, s29, 16 +; SI-NEXT: s_lshl_b32 s53, s25, 16 +; SI-NEXT: s_lshl_b32 s51, s7, 16 +; SI-NEXT: s_lshl_b32 s39, s89, 16 +; SI-NEXT: s_lshl_b32 s48, s9, 16 +; SI-NEXT: s_lshl_b32 s38, s8, 16 +; SI-NEXT: s_lshl_b32 s37, s11, 16 +; SI-NEXT: s_lshl_b32 s35, s10, 16 +; SI-NEXT: s_lshl_b32 s36, s13, 16 +; SI-NEXT: s_lshl_b32 s31, s12, 16 +; SI-NEXT: s_lshl_b32 s34, s15, 16 +; SI-NEXT: s_lshl_b32 s95, s14, 16 +; SI-NEXT: s_lshl_b32 s30, s41, 16 +; SI-NEXT: s_lshl_b32 s93, s40, 16 +; SI-NEXT: s_lshl_b32 s94, s43, 16 +; SI-NEXT: s_lshl_b32 s91, s42, 16 +; SI-NEXT: s_lshl_b32 s92, s45, 16 +; SI-NEXT: s_lshl_b32 s90, s44, 16 +; SI-NEXT: s_lshl_b32 s28, s47, 16 +; SI-NEXT: s_lshl_b32 s88, s46, 16 +; SI-NEXT: s_lshl_b32 s26, s57, 16 +; SI-NEXT: s_lshl_b32 s78, s56, 16 +; SI-NEXT: s_lshl_b32 s24, s59, 16 +; SI-NEXT: s_lshl_b32 s76, s58, 16 +; SI-NEXT: s_lshl_b32 s22, s87, 16 +; SI-NEXT: s_mov_b32 s77, s18 +; SI-NEXT: s_lshl_b32 s74, s18, 16 +; SI-NEXT: s_lshl_b32 s20, s83, 16 +; SI-NEXT: s_mov_b32 s79, s23 +; SI-NEXT: s_lshl_b32 s72, s23, 16 +; SI-NEXT: s_lshl_b32 s19, s84, 16 +; SI-NEXT: s_mov_b32 s18, s75 +; SI-NEXT: s_lshl_b32 s17, s75, 16 +; SI-NEXT: s_lshl_b32 s61, s80, 16 +; SI-NEXT: s_lshl_b32 s60, s21, 16 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB107_3 ; SI-NEXT: .LBB107_2: -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: s_mov_b32 s16, s68 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: s_mov_b32 s73, s6 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: s_mov_b32 s6, s62 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: 
killed $sgpr17 -; SI-NEXT: s_mov_b32 s98, s69 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: s_mov_b32 s79, s23 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: s_mov_b32 s77, s18 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: s_mov_b32 s18, s75 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr97 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr85 ; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr81 ; SI-NEXT: ; implicit-def: $sgpr99 -; SI-NEXT: ; implicit-def: $sgpr97 -; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr95 -; SI-NEXT: ; implicit-def: $sgpr93 -; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: ; implicit-def: $sgpr31 -; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr70 ; SI-NEXT: ; implicit-def: $sgpr86 -; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $sgpr37 -; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr71 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; 
implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr51 ; SI-NEXT: ; implicit-def: $sgpr39 -; SI-NEXT: ; implicit-def: $sgpr61 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr89 -; SI-NEXT: ; implicit-def: $sgpr49 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr94 ; SI-NEXT: ; implicit-def: $sgpr91 -; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr51 -; SI-NEXT: ; implicit-def: $sgpr71 -; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $sgpr53 -; SI-NEXT: ; implicit-def: $sgpr81 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $sgpr55 ; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $sgpr82 -; SI-NEXT: ; implicit-def: $sgpr65 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr75 -; SI-NEXT: ; implicit-def: $sgpr67 -; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr85 -; 
SI-NEXT: ; implicit-def: $sgpr69 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; kill: killed $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr19 ; SI-NEXT: ; implicit-def: $sgpr17 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: .LBB107_3: ; %Flow ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: s_mov_b32 s5, s17 -; SI-NEXT: s_mov_b32 s17, s86 -; SI-NEXT: s_mov_b32 s86, s7 +; SI-NEXT: s_mov_b32 s5, s60 +; SI-NEXT: s_mov_b32 s60, s17 +; SI-NEXT: s_mov_b32 s4, s61 +; SI-NEXT: s_mov_b32 s17, s72 +; SI-NEXT: s_mov_b32 s61, s74 +; SI-NEXT: s_mov_b32 s72, s76 +; SI-NEXT: s_mov_b32 s74, s78 +; SI-NEXT: s_mov_b32 s76, s88 +; SI-NEXT: s_mov_b32 s78, s90 +; SI-NEXT: s_mov_b32 s88, s91 +; SI-NEXT: s_mov_b32 s90, s92 +; SI-NEXT: s_mov_b32 s91, s93 +; SI-NEXT: s_mov_b32 s92, s94 +; SI-NEXT: s_mov_b32 s93, s95 +; SI-NEXT: s_mov_b32 s94, s30 +; SI-NEXT: s_mov_b32 s95, s31 +; SI-NEXT: s_mov_b32 s30, s34 +; SI-NEXT: s_mov_b32 s31, s35 +; SI-NEXT: s_mov_b32 s34, s36 +; SI-NEXT: s_mov_b32 s35, s37 +; SI-NEXT: s_mov_b32 s36, s48 +; SI-NEXT: s_mov_b32 s37, s51 +; SI-NEXT: s_mov_b32 s48, s53 +; SI-NEXT: s_mov_b32 s51, s6 +; SI-NEXT: s_mov_b32 s53, s16 ; SI-NEXT: s_cbranch_vccnz .LBB107_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_lshl_b32 s5, s6, 16 -; SI-NEXT: v_readlane_b32 s6, v41, 24 -; SI-NEXT: s_lshl_b32 s20, s6, 16 -; SI-NEXT: v_readlane_b32 s6, v41, 23 -; SI-NEXT: s_lshl_b32 s17, s6, 16 -; 
SI-NEXT: v_readlane_b32 s6, v41, 22 -; SI-NEXT: s_lshl_b32 s61, s16, 16 -; SI-NEXT: s_add_i32 s16, s6, 3 -; SI-NEXT: v_readlane_b32 s6, v41, 21 -; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_lshl_b32 s7, s6, 16 -; SI-NEXT: v_readlane_b32 s6, v41, 20 -; SI-NEXT: s_or_b32 s7, s7, s16 -; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: v_readlane_b32 s16, v41, 19 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_and_b32 s19, s19, 0xffff -; SI-NEXT: s_or_b32 s6, s16, s6 -; SI-NEXT: v_readlane_b32 s16, v41, 18 -; SI-NEXT: s_lshl_b32 s60, s98, 16 -; SI-NEXT: s_or_b32 s17, s17, s19 -; SI-NEXT: s_add_i32 s98, s16, 3 -; SI-NEXT: v_readlane_b32 s19, v41, 17 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_and_b32 s16, s98, 0xffff -; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_and_b32 s21, s21, 0xffff -; SI-NEXT: s_or_b32 s16, s19, s16 -; SI-NEXT: v_readlane_b32 s19, v41, 16 ; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: v_readlane_b32 s6, v41, 30 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: s_and_b32 s11, s11, 0xffff ; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_lshl_b32 s13, s6, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 29 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_lshl_b32 s11, s25, 16 +; SI-NEXT: s_add_i32 s25, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v41, 28 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_lshl_b32 s15, s6, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 27 +; SI-NEXT: s_add_i32 s23, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v41, 26 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_lshl_b32 s20, s6, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 25 +; SI-NEXT: s_lshl_b32 s5, s21, 16 ; SI-NEXT: s_and_b32 s9, s9, 0xffff ; SI-NEXT: 
s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_or_b32 s20, s20, s21 -; SI-NEXT: s_add_i32 s96, s19, 3 -; SI-NEXT: v_readlane_b32 s21, v41, 15 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_and_b32 s13, s13, 0xffff -; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_add_i32 s21, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v41, 23 ; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s9, s89, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_lshl_b32 s19, s6, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 22 +; SI-NEXT: s_or_b32 s7, s9, s7 ; SI-NEXT: s_and_b32 s9, s29, 0xffff -; SI-NEXT: s_lshl_b32 s11, s88, 16 ; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_and_b32 s19, s96, 0xffff -; SI-NEXT: s_lshl_b32 s21, s21, 16 -; SI-NEXT: s_and_b32 s15, s15, 0xffff -; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_add_i32 s16, s6, 3 +; SI-NEXT: v_readlane_b32 s6, v41, 21 ; SI-NEXT: s_or_b32 s9, s11, s9 ; SI-NEXT: s_and_b32 s11, s27, 0xffff -; SI-NEXT: s_lshl_b32 s13, s79, 16 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_or_b32 s19, s21, s19 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_readlane_b32 s21, v41, 14 -; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s6, 16 +; SI-NEXT: v_readlane_b32 s6, v41, 20 ; SI-NEXT: s_or_b32 s11, s13, s11 ; SI-NEXT: s_and_b32 s13, s25, 0xffff -; SI-NEXT: s_lshl_b32 s15, s78, 16 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_and_b32 s18, s18, 0xffff -; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_readlane_b32 s17, v41, 19 ; SI-NEXT: s_or_b32 s13, s15, s13 ; SI-NEXT: s_and_b32 s15, s23, 0xffff -; SI-NEXT: s_lshl_b32 s22, s77, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s60, s18, 16 +; SI-NEXT: s_or_b32 s15, s20, s15 +; 
SI-NEXT: s_and_b32 s20, s21, 0xffff +; SI-NEXT: s_or_b32 s6, s17, s6 +; SI-NEXT: v_readlane_b32 s17, v41, 18 +; SI-NEXT: v_readlane_b32 s18, v41, 17 +; SI-NEXT: s_or_b32 s19, s19, s20 +; SI-NEXT: s_add_i32 s98, s17, 3 +; SI-NEXT: s_lshl_b32 s20, s18, 16 +; SI-NEXT: v_readlane_b32 s18, v41, 16 +; SI-NEXT: s_and_b32 s17, s98, 0xffff +; SI-NEXT: s_add_i32 s96, s18, 3 +; SI-NEXT: v_readlane_b32 s18, v41, 15 +; SI-NEXT: s_or_b32 s17, s20, s17 +; SI-NEXT: s_and_b32 s20, s96, 0xffff +; SI-NEXT: s_lshl_b32 s21, s18, 16 +; SI-NEXT: v_readlane_b32 s18, v41, 24 +; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: v_readlane_b32 s21, v41, 14 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_lshl_b32 s21, s21, 16 ; SI-NEXT: s_or_b32 s18, s21, s18 ; SI-NEXT: v_readlane_b32 s21, v41, 13 -; SI-NEXT: s_or_b32 s15, s22, s15 ; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: v_readlane_b32 s22, v41, 12 ; SI-NEXT: s_and_b32 s21, s21, 0xffff @@ -232049,42 +233444,20 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_and_b32 s27, s27, 0xffff ; SI-NEXT: s_lshl_b32 s28, s28, 16 ; SI-NEXT: s_or_b32 s27, s28, s27 -; SI-NEXT: s_add_i32 s27, s27, 0x30000 -; SI-NEXT: s_add_i32 s26, s26, 0x30000 -; SI-NEXT: s_and_b32 s86, s27, 0xffff0000 -; SI-NEXT: s_lshl_b32 s27, s27, 16 -; SI-NEXT: s_add_i32 s25, s25, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s27, 25 -; SI-NEXT: s_and_b32 s96, s26, 0xffff0000 -; SI-NEXT: s_lshl_b32 s26, s26, 16 -; SI-NEXT: s_add_i32 s24, s24, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s26, 26 -; SI-NEXT: s_and_b32 s99, s25, 0xffff0000 -; SI-NEXT: s_lshl_b32 s25, s25, 16 -; SI-NEXT: s_add_i32 s23, s23, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s25, 27 -; SI-NEXT: s_and_b32 s97, s24, 0xffff0000 -; SI-NEXT: s_lshl_b32 s24, s24, 16 ; SI-NEXT: s_add_i32 s80, s80, 3 -; SI-NEXT: s_add_i32 s22, s22, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s24, 28 -; SI-NEXT: s_and_b32 s92, s23, 0xffff0000 -; SI-NEXT: s_lshl_b32 s23, 
s23, 16 +; SI-NEXT: s_add_i32 s27, s27, 0x30000 ; SI-NEXT: s_and_b32 s4, s80, 0xffff ; SI-NEXT: s_add_i32 s84, s84, 3 -; SI-NEXT: s_add_i32 s21, s21, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s23, 29 -; SI-NEXT: s_and_b32 s94, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_and_b32 s28, s27, 0xffff0000 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s84, 0xffff ; SI-NEXT: s_add_i32 s83, s83, 3 -; SI-NEXT: s_add_i32 s18, s18, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s22, 30 -; SI-NEXT: s_and_b32 s95, s21, 0xffff0000 -; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: s_add_i32 s26, s26, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s28, 31 +; SI-NEXT: s_lshl_b32 s27, s27, 16 ; SI-NEXT: s_or_b32 s5, s60, s5 ; SI-NEXT: s_and_b32 s60, s83, 0xffff +; SI-NEXT: s_lshl_b32 s61, s79, 16 ; SI-NEXT: s_add_i32 s87, s87, 3 ; SI-NEXT: s_add_i32 s59, s59, 3 ; SI-NEXT: s_add_i32 s57, s57, 3 @@ -232092,13 +233465,11 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_add_i32 s45, s45, 3 ; SI-NEXT: s_add_i32 s43, s43, 3 ; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s19, s19, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s21, 31 -; SI-NEXT: s_and_b32 s93, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_or_b32 s76, s61, s60 +; SI-NEXT: v_writelane_b32 v41, s27, 32 +; SI-NEXT: s_and_b32 s27, s26, 0xffff0000 +; SI-NEXT: s_or_b32 vcc_lo, s61, s60 ; SI-NEXT: s_and_b32 s60, s87, 0xffff -; SI-NEXT: s_lshl_b32 s61, s73, 16 +; SI-NEXT: s_lshl_b32 s61, s77, 16 ; SI-NEXT: s_and_b32 s59, s59, 0xffff ; SI-NEXT: s_lshl_b32 s58, s58, 16 ; SI-NEXT: s_and_b32 s57, s57, 0xffff @@ -232111,24 +233482,22 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_lshl_b32 s42, s42, 16 ; SI-NEXT: s_and_b32 s41, s41, 0xffff ; SI-NEXT: s_lshl_b32 s40, s40, 16 -; SI-NEXT: s_add_i32 s16, s16, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s18, 32 -; SI-NEXT: s_lshl_b32 s18, s19, 16 -; SI-NEXT: 
s_or_b32 s75, s61, s60 +; SI-NEXT: s_add_i32 s25, s25, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s27, 33 +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: s_or_b32 vcc_hi, s61, s60 ; SI-NEXT: s_or_b32 s58, s58, s59 ; SI-NEXT: s_or_b32 s56, s56, s57 ; SI-NEXT: s_or_b32 s46, s46, s47 ; SI-NEXT: s_or_b32 s44, s44, s45 ; SI-NEXT: s_or_b32 s42, s42, s43 ; SI-NEXT: s_or_b32 s40, s40, s41 -; SI-NEXT: s_add_i32 s6, s6, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s18, 33 -; SI-NEXT: s_and_b32 s31, s16, 0xffff0000 -; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: v_writelane_b32 v41, s26, 34 +; SI-NEXT: s_and_b32 s26, s25, 0xffff0000 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 -; SI-NEXT: s_add_i32 s76, s76, 0x30000 -; SI-NEXT: s_add_i32 s75, s75, 0x30000 +; SI-NEXT: s_add_i32 vcc_lo, vcc_lo, 0x30000 +; SI-NEXT: s_add_i32 vcc_hi, vcc_hi, 0x30000 ; SI-NEXT: s_add_i32 s58, s58, 0x30000 ; SI-NEXT: s_add_i32 s56, s56, 0x30000 ; SI-NEXT: s_add_i32 s46, s46, 0x30000 @@ -232139,294 +233508,343 @@ define inreg <64 x bfloat> @bitcast_v64i16_to_v64bf16_scalar(<64 x i16> inreg %a ; SI-NEXT: s_add_i32 s12, s12, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 ; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s11, s11, 0x30000 ; SI-NEXT: s_add_i32 s13, s13, 0x30000 ; SI-NEXT: s_add_i32 s15, s15, 0x30000 -; SI-NEXT: s_add_i32 s20, s20, 0x30000 +; SI-NEXT: s_add_i32 s19, s19, 0x30000 +; SI-NEXT: s_add_i32 s16, s16, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 ; SI-NEXT: s_add_i32 s17, s17, 0x30000 -; SI-NEXT: s_add_i32 s7, s7, 0x30000 -; SI-NEXT: v_writelane_b32 v41, s16, 34 -; SI-NEXT: s_and_b32 s34, s6, 0xffff0000 -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s30, s19, 0xffff0000 -; SI-NEXT: v_writelane_b32 v41, s6, 35 -; SI-NEXT: s_and_b32 s35, s7, 0xffff0000 -; SI-NEXT: s_lshl_b32 s6, s7, 16 -; SI-NEXT: s_and_b32 s36, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; 
SI-NEXT: s_and_b32 s37, s20, 0xffff0000 -; SI-NEXT: s_lshl_b32 s22, s20, 16 -; SI-NEXT: s_and_b32 s38, s15, 0xffff0000 -; SI-NEXT: s_lshl_b32 s24, s15, 16 -; SI-NEXT: s_and_b32 s39, s13, 0xffff0000 -; SI-NEXT: s_lshl_b32 s28, s13, 16 -; SI-NEXT: s_and_b32 s48, s11, 0xffff0000 -; SI-NEXT: s_lshl_b32 s61, s11, 16 -; SI-NEXT: s_and_b32 s49, s9, 0xffff0000 -; SI-NEXT: s_lshl_b32 s89, s9, 16 -; SI-NEXT: s_and_b32 s50, s8, 0xffff0000 -; SI-NEXT: s_lshl_b32 s60, s8, 16 -; SI-NEXT: s_and_b32 s91, s10, 0xffff0000 -; SI-NEXT: s_lshl_b32 s90, s10, 16 -; SI-NEXT: s_and_b32 s51, s12, 0xffff0000 -; SI-NEXT: s_lshl_b32 s70, s12, 16 -; SI-NEXT: s_and_b32 s52, s14, 0xffff0000 -; SI-NEXT: s_lshl_b32 s71, s14, 16 -; SI-NEXT: s_and_b32 s53, s40, 0xffff0000 -; SI-NEXT: s_lshl_b32 s20, s40, 16 -; SI-NEXT: s_and_b32 s54, s42, 0xffff0000 -; SI-NEXT: s_lshl_b32 s81, s42, 16 -; SI-NEXT: s_and_b32 s55, s44, 0xffff0000 -; SI-NEXT: s_lshl_b32 s63, s44, 16 -; SI-NEXT: s_and_b32 s64, s46, 0xffff0000 -; SI-NEXT: s_lshl_b32 s72, s46, 16 -; SI-NEXT: s_and_b32 s65, s56, 0xffff0000 -; SI-NEXT: s_lshl_b32 s82, s56, 16 -; SI-NEXT: s_and_b32 s66, s58, 0xffff0000 -; SI-NEXT: s_lshl_b32 s74, s58, 16 -; SI-NEXT: s_and_b32 s67, s75, 0xffff0000 -; SI-NEXT: s_lshl_b32 s75, s75, 16 -; SI-NEXT: s_and_b32 s68, s76, 0xffff0000 -; SI-NEXT: s_lshl_b32 s76, s76, 16 -; SI-NEXT: s_and_b32 s69, s5, 0xffff0000 -; SI-NEXT: s_lshl_b32 s85, s5, 16 -; SI-NEXT: s_and_b32 s26, s4, 0xffff0000 -; SI-NEXT: s_lshl_b32 s5, s4, 16 -; SI-NEXT: v_writelane_b32 v41, s6, 36 +; SI-NEXT: s_add_i32 s20, s20, 0x30000 +; SI-NEXT: s_add_i32 s18, s18, 0x30000 +; SI-NEXT: s_add_i32 s21, s21, 0x30000 +; SI-NEXT: s_add_i32 s22, s22, 0x30000 +; SI-NEXT: s_add_i32 s23, s23, 0x30000 +; SI-NEXT: s_add_i32 s24, s24, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s26, 35 +; SI-NEXT: s_lshl_b32 s25, s25, 16 +; SI-NEXT: v_writelane_b32 v41, s25, 36 +; SI-NEXT: s_and_b32 s25, s24, 0xffff0000 +; SI-NEXT: s_lshl_b32 s53, s24, 16 +; SI-NEXT: s_and_b32 s73, s23, 
0xffff0000 +; SI-NEXT: s_lshl_b32 s51, s23, 16 +; SI-NEXT: s_and_b32 s63, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s98, s22, 16 +; SI-NEXT: s_and_b32 s62, s21, 0xffff0000 +; SI-NEXT: s_lshl_b32 s97, s21, 16 +; SI-NEXT: s_and_b32 s96, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s85, s18, 16 +; SI-NEXT: s_and_b32 s99, s20, 0xffff0000 +; SI-NEXT: s_lshl_b32 s81, s20, 16 +; SI-NEXT: s_and_b32 s86, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s70, s17, 16 +; SI-NEXT: s_and_b32 s82, s6, 0xffff0000 +; SI-NEXT: s_lshl_b32 s68, s6, 16 +; SI-NEXT: s_and_b32 s71, s16, 0xffff0000 +; SI-NEXT: s_lshl_b32 s66, s16, 16 +; SI-NEXT: s_and_b32 s69, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s64, s19, 16 +; SI-NEXT: s_and_b32 s67, s15, 0xffff0000 +; SI-NEXT: s_lshl_b32 s54, s15, 16 +; SI-NEXT: s_and_b32 s65, s13, 0xffff0000 +; SI-NEXT: s_lshl_b32 s52, s13, 16 +; SI-NEXT: s_and_b32 s55, s11, 0xffff0000 +; SI-NEXT: s_lshl_b32 s50, s11, 16 +; SI-NEXT: s_and_b32 s48, s9, 0xffff0000 +; SI-NEXT: s_lshl_b32 s49, s9, 16 +; SI-NEXT: s_and_b32 s39, s7, 0xffff0000 +; SI-NEXT: s_lshl_b32 s37, s7, 16 +; SI-NEXT: s_and_b32 s38, s8, 0xffff0000 +; SI-NEXT: s_lshl_b32 s36, s8, 16 +; SI-NEXT: s_and_b32 s31, s10, 0xffff0000 +; SI-NEXT: s_lshl_b32 s35, s10, 16 +; SI-NEXT: s_and_b32 s95, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s34, s12, 16 +; SI-NEXT: s_and_b32 s93, s14, 0xffff0000 +; SI-NEXT: s_lshl_b32 s30, s14, 16 +; SI-NEXT: s_and_b32 s91, s40, 0xffff0000 +; SI-NEXT: s_lshl_b32 s94, s40, 16 +; SI-NEXT: s_and_b32 s88, s42, 0xffff0000 +; SI-NEXT: s_lshl_b32 s92, s42, 16 +; SI-NEXT: s_and_b32 s78, s44, 0xffff0000 +; SI-NEXT: s_lshl_b32 s90, s44, 16 +; SI-NEXT: s_and_b32 s76, s46, 0xffff0000 +; SI-NEXT: s_lshl_b32 s28, s46, 16 +; SI-NEXT: s_and_b32 s74, s56, 0xffff0000 +; SI-NEXT: s_lshl_b32 s26, s56, 16 +; SI-NEXT: s_and_b32 s72, s58, 0xffff0000 +; SI-NEXT: s_lshl_b32 s24, s58, 16 +; SI-NEXT: s_and_b32 s61, vcc_hi, 0xffff0000 +; SI-NEXT: s_lshl_b32 s22, vcc_hi, 16 +; SI-NEXT: s_and_b32 s17, vcc_lo, 0xffff0000 +; SI-NEXT: 
s_lshl_b32 s20, vcc_lo, 16 +; SI-NEXT: s_and_b32 s60, s5, 0xffff0000 +; SI-NEXT: s_lshl_b32 s19, s5, 16 +; SI-NEXT: s_and_b32 s5, s4, 0xffff0000 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_writelane_b32 v41, s25, 37 ; SI-NEXT: .LBB107_5: ; %end -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s86 -; SI-NEXT: v_readlane_b32 s4, v41, 25 +; SI-NEXT: v_readlane_b32 s6, v41, 32 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_readlane_b32 s6, v41, 31 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s6 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_readlane_b32 s6, v41, 34 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s96 -; SI-NEXT: v_readlane_b32 s4, v41, 26 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_readlane_b32 s6, v41, 33 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s6 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: v_readlane_b32 s6, v41, 36 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s99 -; SI-NEXT: v_readlane_b32 s4, v41, 27 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; SI-NEXT: v_readlane_b32 s6, v41, 35 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s6 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: v_readlane_b32 s6, v41, 37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s97 -; SI-NEXT: v_readlane_b32 s4, v41, 28 
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s6 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 -; SI-NEXT: v_readlane_b32 s4, v41, 29 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s73 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 -; SI-NEXT: v_readlane_b32 s4, v41, 30 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s98 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s63 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 -; SI-NEXT: v_readlane_b32 s4, v41, 31 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s97 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s62 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s93 -; SI-NEXT: v_readlane_b32 s4, v41, 32 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s85 +; SI-NEXT: v_mul_f32_e64 v2, 
1.0, s96 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30 -; SI-NEXT: v_readlane_b32 s4, v41, 33 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s81 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s99 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s31 -; SI-NEXT: v_readlane_b32 s4, v41, 34 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s70 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s86 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34 -; SI-NEXT: v_readlane_b32 s4, v41, 35 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s82 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 -; SI-NEXT: v_readlane_b32 s4, v41, 36 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s71 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: 
v_mul_f32_e64 v2, 1.0, s4 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s36 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s69 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s67 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s22 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s38 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s65 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s24 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s39 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s50 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s55 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s28 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword 
v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s48 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s48 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s61 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s49 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s37 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s39 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s89 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s50 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s36 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s38 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s60 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s91 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s31 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s90 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s51 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s34 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s95 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 
-; SI-NEXT: v_mul_f32_e64 v2, 1.0, s70 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s52 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s30 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s93 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s71 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s53 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s94 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s91 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s20 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s54 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s92 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s88 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s81 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s55 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s90 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s78 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s63 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; 
SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s72 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s82 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s72 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s74 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s67 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s61 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s75 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 ; SI-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s76 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s69 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s60 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s85 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s5 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: v_readlane_b32 s99, v40, 35 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll index 3e96ab1d597d6..5800414be7476 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll @@ -2439,14 +2439,14 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v13, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v11, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v9, 1.0, v4 -; 
SI-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v7 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -2458,14 +2458,18 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB22_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v0, v13, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v11, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v9, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 @@ -2477,30 +2481,34 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB22_2 ; SI-NEXT: .LBB22_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: 
v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2513,12 +2521,12 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB22_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: 
v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -2529,15 +2537,15 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -2547,15 +2555,15 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v4, 16 -; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 -; VI-NEXT: v_and_b32_e32 
v1, 0xffff0000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -2565,15 +2573,15 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v4, 16 -; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -2583,9 +2591,9 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2848,50 +2856,58 @@ define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v11, 
1.0, s16 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v12 ; SI-NEXT: s_cbranch_execnz .LBB23_3 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 
v2, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: .LBB23_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB23_4: @@ -2906,7 +2922,7 @@ define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB23_4 ; VI-NEXT: .LBB23_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 @@ -2914,7 +2930,7 @@ define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 
0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -2922,17 +2938,17 @@ define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s18, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 @@ -2940,17 +2956,17 @@ define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; 
VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc ; VI-NEXT: v_add_f32_e32 v4, s4, v0 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 @@ -2958,15 +2974,15 @@ define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v4, v1, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v4, s4, v0 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s16, 16 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_add_f32_e32 v0, s4, v0 @@ -2976,9 +2992,9 @@ define inreg <4 x i32> @bitcast_v8bf16_to_v4i32_scalar(<8 x bfloat> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB23_3: ; VI-NEXT: s_branch .LBB23_2 @@ -6897,14 +6913,14 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v13, 1.0, v0 -; SI-NEXT: 
v_mul_f32_e32 v10, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v11, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v9, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v7 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -6916,14 +6932,18 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB46_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v0, v13, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v11, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v9, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 @@ -6935,30 +6955,34 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: .LBB46_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: 
v_and_b32_e32 v1, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -6971,12 +6995,12 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz 
.LBB46_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -6987,15 +7011,15 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -7005,15 +7029,15 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v4, 16 -; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: 
v_and_b32_e32 v4, 0xffff0000, v1 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -7023,15 +7047,15 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v4, 16 -; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -7041,9 +7065,9 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB46_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -7306,50 +7330,58 @@ define inreg <4 x float> 
@bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v12 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: 
v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: .LBB47_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: @@ -7364,7 +7396,7 @@ define inreg <4 x float> @bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a, ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB47_4 ; VI-NEXT: .LBB47_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 @@ -7372,7 +7404,7 @@ define inreg <4 x 
float> @bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -7380,17 +7412,17 @@ define inreg <4 x float> @bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s18, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 @@ -7398,17 +7430,17 @@ define inreg <4 x float> @bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: 
v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc ; VI-NEXT: v_add_f32_e32 v4, s4, v0 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 @@ -7416,15 +7448,15 @@ define inreg <4 x float> @bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v4, v1, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v4, s4, v0 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s16, 16 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_add_f32_e32 v0, s4, v0 @@ -7434,9 +7466,9 @@ define inreg <4 x float> @bitcast_v8bf16_to_v4f32_scalar(<8 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB47_3: ; VI-NEXT: s_branch .LBB47_2 @@ -11003,14 +11035,14 @@ define <2 x i64> 
@bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v13, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v11, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v9, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v7 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -11022,49 +11054,57 @@ define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB66_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v0, v13, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v11, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v9, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; SI-NEXT: 
v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB66_2 ; SI-NEXT: .LBB66_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; SI-NEXT: 
v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -11077,12 +11117,12 @@ define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB66_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -11093,15 +11133,15 @@ define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -11111,15 +11151,15 @@ define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v4, 16 -; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -11129,15 +11169,15 @@ define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v4, 16 -; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -11147,9 +11187,9 @@ 
define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB66_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -11412,50 +11452,58 @@ define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 ; SI-NEXT: s_cbranch_scc0 .LBB67_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; SI-NEXT: v_or_b32_e32 
v2, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v12 ; SI-NEXT: s_cbranch_execnz .LBB67_3 ; SI-NEXT: .LBB67_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: 
v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: .LBB67_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB67_4: @@ -11470,7 +11518,7 @@ define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB67_4 ; VI-NEXT: .LBB67_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 @@ -11478,7 +11526,7 @@ define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -11486,17 +11534,17 @@ define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s18, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; 
VI-NEXT: v_bfe_u32 v4, v2, 16, 1 @@ -11504,17 +11552,17 @@ define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc ; VI-NEXT: v_add_f32_e32 v4, s4, v0 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 @@ -11522,15 +11570,15 @@ define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v4, v1, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v4, s4, v0 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s16, 16 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_add_f32_e32 v0, s4, v0 @@ -11540,9 
+11588,9 @@ define inreg <2 x i64> @bitcast_v8bf16_to_v2i64_scalar(<8 x bfloat> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB67_3: ; VI-NEXT: s_branch .LBB67_2 @@ -14695,14 +14743,14 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v13, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v11, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v9, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v7 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -14714,14 +14762,18 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB82_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v0, v13, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v11, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v9, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, 
v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr11 @@ -14733,30 +14785,34 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB82_2 ; SI-NEXT: .LBB82_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: 
v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -14769,12 +14825,12 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB82_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -14785,15 +14841,15 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; 
VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -14803,15 +14859,15 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v4, 16 -; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -14821,15 +14877,15 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v4, 16 -; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: 
v_add_u32_e32 v5, vcc, s6, v5 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -14839,9 +14895,9 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB82_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -15104,50 +15160,58 @@ define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v7, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s21 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s23 ; SI-NEXT: s_cbranch_scc0 .LBB83_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v9, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 
+; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v12 ; SI-NEXT: s_cbranch_execnz .LBB83_3 ; SI-NEXT: .LBB83_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; SI-NEXT: 
v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: .LBB83_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB83_4: @@ -15162,7 +15226,7 @@ define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a, ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB83_4 ; VI-NEXT: .LBB83_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 @@ -15170,7 +15234,7 @@ define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -15178,17 +15242,17 @@ define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, 
vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s18, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 @@ -15196,17 +15260,17 @@ define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc ; VI-NEXT: v_add_f32_e32 v4, s4, v0 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 @@ -15214,15 +15278,15 @@ define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v4, v1, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v4, s4, v0 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, 
v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s16, 16 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_add_f32_e32 v0, s4, v0 @@ -15232,9 +15296,9 @@ define inreg <2 x double> @bitcast_v8bf16_to_v2f64_scalar(<8 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB83_3: ; VI-NEXT: s_branch .LBB83_2 @@ -18024,32 +18088,36 @@ define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB94_2 ; SI-NEXT: .LBB94_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 -; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v4, v2, v5 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v2, 
0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v6, v2, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -18129,14 +18197,14 @@ define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v7, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; VI-NEXT: v_alignbit_b32 v1, v1, v5, 16 -; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: 
v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB94_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -18426,32 +18494,36 @@ define inreg <8 x i16> @bitcast_v8bf16_to_v8i16_scalar(<8 x bfloat> inreg %a, i3 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v10 ; SI-NEXT: s_cbranch_execnz .LBB95_3 ; SI-NEXT: .LBB95_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 -; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v4, v2, v5 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; 
SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v6, v2, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 ; SI-NEXT: .LBB95_3: ; %end @@ -18489,12 +18561,11 @@ define inreg <8 x i16> @bitcast_v8bf16_to_v8i16_scalar(<8 x bfloat> inreg %a, i3 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v5, v2, v3, vcc ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 @@ -18506,12 +18577,11 @@ define inreg <8 x i16> @bitcast_v8bf16_to_v8i16_scalar(<8 x bfloat> inreg %a, i3 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_lshl_b32 s4, s18, 16 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 
vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v6, v3, v6, vcc ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 @@ -18523,12 +18593,11 @@ define inreg <8 x i16> @bitcast_v8bf16_to_v8i16_scalar(<8 x bfloat> inreg %a, i3 ; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; VI-NEXT: v_bfe_u32 v8, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v3 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 @@ -18543,11 +18612,14 @@ define inreg <8 x i16> @bitcast_v8bf16_to_v8i16_scalar(<8 x bfloat> inreg %a, i3 ; VI-NEXT: v_or_b32_e32 v9, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v3, v0, v3, 16 -; VI-NEXT: v_alignbit_b32 v2, v7, v2, 16 -; VI-NEXT: v_alignbit_b32 v1, v6, v1, 16 -; VI-NEXT: v_alignbit_b32 v0, v5, v4, 16 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 +; VI-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB95_3: ; VI-NEXT: s_branch .LBB95_2 @@ -21043,14 +21115,14 @@ define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v7, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v6, 16 -; VI-NEXT: v_alignbit_b32 v1, v1, v5, 16 -; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB102_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -21411,12 +21483,11 @@ define inreg <8 x half> @bitcast_v8bf16_to_v8f16_scalar(<8 x bfloat> inreg %a, i ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v5, v2, v3, vcc ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 
v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 @@ -21428,12 +21499,11 @@ define inreg <8 x half> @bitcast_v8bf16_to_v8f16_scalar(<8 x bfloat> inreg %a, i ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_lshl_b32 s4, s18, 16 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v6, v3, v6, vcc ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 @@ -21445,12 +21515,11 @@ define inreg <8 x half> @bitcast_v8bf16_to_v8f16_scalar(<8 x bfloat> inreg %a, i ; VI-NEXT: v_bfe_u32 v7, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; VI-NEXT: v_add_f32_e32 v3, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; VI-NEXT: v_bfe_u32 v8, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v3 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 @@ -21465,11 +21534,14 @@ define inreg <8 x half> @bitcast_v8bf16_to_v8f16_scalar(<8 x bfloat> inreg %a, i ; VI-NEXT: v_or_b32_e32 v9, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v3, v0, v3, 16 -; VI-NEXT: v_alignbit_b32 v2, v7, v2, 16 -; VI-NEXT: v_alignbit_b32 v1, v6, v1, 16 -; VI-NEXT: v_alignbit_b32 v0, v5, v4, 16 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: 
v_and_b32_e32 v0, 0xffff0000, v7 +; VI-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB103_3: ; VI-NEXT: s_branch .LBB103_2 @@ -23438,14 +23510,14 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v0 ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v18, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v23, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v6 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v6 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -23472,59 +23544,71 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB108_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 -; SI-NEXT: v_alignbit_b32 v0, v0, v20, 16 -; SI-NEXT: v_alignbit_b32 v4, v6, v18, 16 -; SI-NEXT: v_alignbit_b32 v8, v5, v23, 16 -; SI-NEXT: v_alignbit_b32 v12, v14, v21, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v23 +; SI-NEXT: v_and_b32_e32 v6, 
0xffff0000, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16 +; SI-NEXT: v_or_b32_e32 v8, v5, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v20 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v17 +; SI-NEXT: v_or_b32_e32 v4, v1, v2 +; SI-NEXT: v_or_b32_e32 v12, v5, v6 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 ; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 ; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 ; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v16 ; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 ; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB108_2 ; SI-NEXT: .LBB108_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v8, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v8, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v20 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 -; SI-NEXT: v_and_b32_e32 v0, 
0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; SI-NEXT: v_or_b32_e32 v12, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v21 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 -; SI-NEXT: v_alignbit_b32 v12, v14, v0, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v20 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v4, v1, v2 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 @@ -23578,91 +23662,95 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB108_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v19 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 ; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; VI-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v19 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 ; VI-NEXT: s_movk_i32 s6, 0x7fff -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v19, v1, v0, 16 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 -; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v18 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v18, v1, v0, 16 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; VI-NEXT: v_or_b32_e32 v1, v19, v0 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v18 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 
+; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v17, v1, v0, 16 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v16, v1, v0, 16 -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v17 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v19 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; VI-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc +; 
VI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v7, v4, v5, vcc +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v17 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 +; VI-NEXT: v_or_b32_e32 v6, v17, v3 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v10, v4, v5, vcc +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v16 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v10 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; VI-NEXT: 
v_or_b32_e32 v0, v18, v0 +; VI-NEXT: v_or_b32_e32 v5, v16, v3 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v7 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 ; VI-NEXT: .LBB108_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v18 @@ -24090,60 +24178,72 @@ define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v19, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v17, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v23, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v20, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s21 ; SI-NEXT: v_mul_f32_e64 v21, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s23 ; SI-NEXT: s_cbranch_scc0 .LBB109_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v20 -; SI-NEXT: v_alignbit_b32 v0, v0, v19, 16 -; SI-NEXT: v_alignbit_b32 v4, v6, v17, 16 -; SI-NEXT: v_alignbit_b32 v8, v5, v23, 16 -; SI-NEXT: v_alignbit_b32 v12, v14, v21, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v19 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v23 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v22 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 
v1, 16, v17 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16 +; SI-NEXT: v_or_b32_e32 v8, v5, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v21 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v20 +; SI-NEXT: v_or_b32_e32 v4, v1, v2 +; SI-NEXT: v_or_b32_e32 v12, v5, v6 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 ; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 ; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 ; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v16 ; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v20 ; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v20 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v20 ; SI-NEXT: s_cbranch_execnz .LBB109_3 ; SI-NEXT: .LBB109_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v8, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v8, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v12, v14, v0, 16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; SI-NEXT: v_or_b32_e32 v12, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v1, 
0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v4, v1, v2 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 @@ -24181,142 +24281,144 @@ define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i3 ; VI-NEXT: s_cmp_lg_u32 s20, 0 ; VI-NEXT: s_cbranch_scc0 .LBB109_3 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s10, s19, 24 -; VI-NEXT: s_lshr_b32 s11, s19, 16 -; VI-NEXT: s_lshr_b32 s13, s19, 8 -; VI-NEXT: s_lshr_b32 s12, s18, 16 -; VI-NEXT: s_lshr_b32 s14, s18, 8 +; VI-NEXT: s_lshr_b32 s22, s19, 24 +; VI-NEXT: s_lshr_b32 s21, s19, 16 +; VI-NEXT: s_lshr_b32 s11, s19, 8 +; VI-NEXT: s_lshr_b32 s23, s18, 16 +; VI-NEXT: s_lshr_b32 s13, s18, 8 ; VI-NEXT: s_lshr_b32 s15, s17, 24 -; VI-NEXT: s_lshr_b32 s20, s17, 16 -; VI-NEXT: s_lshr_b32 s22, s17, 8 -; VI-NEXT: s_lshr_b32 s21, s16, 16 -; VI-NEXT: s_lshr_b32 s23, s16, 8 +; VI-NEXT: s_lshr_b32 s14, s17, 16 +; VI-NEXT: s_lshr_b32 s10, s17, 8 +; VI-NEXT: s_lshr_b32 s20, s16, 16 +; VI-NEXT: s_lshr_b32 s12, s16, 8 ; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 ; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 ; VI-NEXT: s_cbranch_execnz 
.LBB109_4 ; VI-NEXT: .LBB109_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v19, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v3, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v3 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_cndmask_b32_e32 v18, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v3 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v18 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, 
v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v18, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_or_b32_e32 v2, v17, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v3 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v3 +; VI-NEXT: v_cndmask_b32_e32 v19, v1, v4, vcc +; VI-NEXT: v_bfe_u32 v4, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v17, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v3 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: s_lshl_b32 s4, s19, 16 
+; VI-NEXT: v_cndmask_b32_e32 v7, v5, v6, vcc +; VI-NEXT: v_add_f32_e32 v5, s4, v3 +; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 ; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_or_b32_e32 v6, v16, v4 +; VI-NEXT: v_add_f32_e32 v4, s4, v3 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v3, s4, v3 +; VI-NEXT: v_cndmask_b32_e32 v10, v5, v8, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v16, v0, v1, 16 -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v17 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 
v7, 24, v19 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v10 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; VI-NEXT: v_or_b32_e32 v1, v0, v1 +; VI-NEXT: v_or_b32_e32 v5, v8, v4 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v7 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v18 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v19 ; VI-NEXT: s_branch .LBB109_5 ; VI-NEXT: .LBB109_3: -; VI-NEXT: ; implicit-def: $sgpr23 -; VI-NEXT: ; implicit-def: $sgpr21 -; VI-NEXT: ; implicit-def: $sgpr4 -; VI-NEXT: ; implicit-def: $sgpr22 +; VI-NEXT: ; implicit-def: $sgpr12 ; VI-NEXT: ; implicit-def: $sgpr20 -; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr10 ; VI-NEXT: ; implicit-def: $sgpr14 -; VI-NEXT: ; implicit-def: $sgpr12 -; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr15 ; VI-NEXT: ; implicit-def: $sgpr13 +; VI-NEXT: ; implicit-def: $sgpr23 +; VI-NEXT: ; implicit-def: $sgpr6 ; VI-NEXT: ; implicit-def: $sgpr11 -; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr21 +; VI-NEXT: ; implicit-def: $sgpr22 ; VI-NEXT: s_branch .LBB109_2 ; VI-NEXT: .LBB109_4: -; VI-NEXT: v_mov_b32_e32 v18, s16 -; VI-NEXT: v_mov_b32_e32 v19, s17 -; VI-NEXT: v_mov_b32_e32 v16, s18 -; VI-NEXT: v_mov_b32_e32 v17, s19 -; VI-NEXT: v_mov_b32_e32 v1, s23 -; VI-NEXT: v_mov_b32_e32 v2, s21 -; VI-NEXT: v_mov_b32_e32 v5, s22 -; VI-NEXT: v_mov_b32_e32 v6, s20 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v10, 
s23 +; VI-NEXT: v_mov_b32_e32 v16, s19 +; VI-NEXT: v_mov_b32_e32 v15, s22 +; VI-NEXT: v_mov_b32_e32 v14, s21 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v2, s20 +; VI-NEXT: v_mov_b32_e32 v17, s17 ; VI-NEXT: v_mov_b32_e32 v7, s15 -; VI-NEXT: v_mov_b32_e32 v9, s14 -; VI-NEXT: v_mov_b32_e32 v10, s12 -; VI-NEXT: v_mov_b32_e32 v13, s13 -; VI-NEXT: v_mov_b32_e32 v14, s11 -; VI-NEXT: v_mov_b32_e32 v15, s10 +; VI-NEXT: v_mov_b32_e32 v6, s14 +; VI-NEXT: v_mov_b32_e32 v9, s13 +; VI-NEXT: v_mov_b32_e32 v13, s11 +; VI-NEXT: v_mov_b32_e32 v1, s12 +; VI-NEXT: v_mov_b32_e32 v5, s10 ; VI-NEXT: v_mov_b32_e32 v11, s6 ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: .LBB109_5: ; %end -; VI-NEXT: v_mov_b32_e32 v0, v18 -; VI-NEXT: v_mov_b32_e32 v4, v19 -; VI-NEXT: v_mov_b32_e32 v8, v16 -; VI-NEXT: v_mov_b32_e32 v12, v17 +; VI-NEXT: v_mov_b32_e32 v4, v17 +; VI-NEXT: v_mov_b32_e32 v12, v16 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v8bf16_to_v16i8_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll index f8ffaa456c2b3..599b1d6336ec3 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll @@ -3327,22 +3327,22 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v27, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v25, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v23, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v21, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v7 ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v8 -; SI-NEXT: 
v_mul_f32_e32 v16, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v9 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v11 ; SI-NEXT: v_mul_f32_e32 v11, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v13 ; SI-NEXT: v_mul_f32_e32 v9, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v15 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -3354,89 +3354,105 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB22_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_alignbit_b32 v0, v0, v27, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v25, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v23, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v21, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v19, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v17, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v19 +; 
SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v17 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB22_2 ; SI-NEXT: .LBB22_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 
16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; 
SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -3449,12 +3465,12 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB22_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 @@ -3465,15 +3481,15 @@ define <8 x i32> 
@bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v6 +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v6 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 -; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 @@ -3483,15 +3499,15 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v8, 16 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; VI-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v5 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 -; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 @@ -3501,15 +3517,15 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, 
v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v8, 16 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; VI-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v4 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 -; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -3519,15 +3535,15 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v8, 16 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v3 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -3537,15 +3553,15 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; 
VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v8, 16 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; VI-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -3555,15 +3571,15 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v8, 16 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; VI-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v1 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -3573,15 +3589,15 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc -; 
VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v8, 16 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v0 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -3591,9 +3607,9 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -4054,90 +4070,106 @@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21 ; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23 ; SI-NEXT: 
v_mul_f32_e64 v15, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25 ; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27 ; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29 ; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v17 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v15 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v13 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; 
SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v24 ; SI-NEXT: s_cbranch_execnz .LBB23_3 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14 -; SI-NEXT: v_alignbit_b32 v3, 
v4, v3, 16 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; 
SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: .LBB23_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB23_4: @@ -4152,7 +4184,7 @@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB23_4 ; VI-NEXT: .LBB23_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 ; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 @@ -4160,7 +4192,7 @@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s23, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -4168,17 +4200,17 @@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v7, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s22, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -4186,17 +4218,17 @@ define inreg <8 x i32> 
@bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v6, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s21, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -4204,17 +4236,17 @@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v5, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s20, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -4222,17 +4254,17 
@@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v4, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -4240,17 +4272,17 @@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v8, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s18, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v8, v2, 
16, 1 @@ -4258,17 +4290,17 @@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; VI-NEXT: v_or_b32_e32 v9, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v8, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc ; VI-NEXT: v_add_f32_e32 v8, s4, v0 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 @@ -4276,15 +4308,15 @@ define inreg <8 x i32> @bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v8, v1, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v8, s4, v0 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s16, 16 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_add_f32_e32 v0, s4, v0 @@ -4294,9 +4326,9 @@ define inreg <8 x i32> 
@bitcast_v16bf16_to_v8i32_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB23_3: ; VI-NEXT: s_branch .LBB23_2 @@ -10340,22 +10372,22 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v27, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v25, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v23, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v21, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v7 ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v9 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v11 ; SI-NEXT: v_mul_f32_e32 v11, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v13 ; SI-NEXT: v_mul_f32_e32 v9, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v15 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -10367,89 +10399,105 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: 
s_setpc_b64 s[30:31] ; SI-NEXT: .LBB46_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_alignbit_b32 v0, v0, v27, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v25, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v23, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v21, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v19, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v17, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v19 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v17 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: 
$vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: .LBB46_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; 
SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -10462,12 +10510,12 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB46_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 @@ -10478,15 +10526,15 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v6 +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v6 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, 
v9 -; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 @@ -10496,15 +10544,15 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v8, 16 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; VI-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v5 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 -; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 @@ -10514,15 +10562,15 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v8, 16 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; VI-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v4 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 -; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, 
v4 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -10532,15 +10580,15 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v8, 16 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v3 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -10550,15 +10598,15 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v8, 16 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; VI-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; 
VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -10568,15 +10616,15 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v8, 16 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; VI-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v1 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -10586,15 +10634,15 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v8, 16 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v0 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -10604,9 +10652,9 @@ define <8 x float> 
@bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB46_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -11067,90 +11115,106 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21 ; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23 ; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25 ; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27 ; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29 ; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 -; SI-NEXT: 
v_lshrrev_b32_e32 v4, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v17 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v15 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v13 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v24 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, 
v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: 
v_and_b32_e32 v5, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: .LBB47_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: @@ -11165,7 +11229,7 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB47_4 ; VI-NEXT: .LBB47_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 ; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 @@ -11173,7 +11237,7 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; 
VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s23, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -11181,17 +11245,17 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v7, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s22, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -11199,17 +11263,17 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v6, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: 
v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s21, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -11217,17 +11281,17 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v5, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s20, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -11235,17 +11299,17 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v4, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: 
v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -11253,17 +11317,17 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v8, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s18, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v8, v2, 16, 1 @@ -11271,17 +11335,17 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; VI-NEXT: v_or_b32_e32 v9, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: 
v_bfe_u32 v8, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc ; VI-NEXT: v_add_f32_e32 v8, s4, v0 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 @@ -11289,15 +11353,15 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v8, v1, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v8, s4, v0 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s16, 16 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_add_f32_e32 v0, s4, v0 @@ -11307,9 +11371,9 @@ define inreg <8 x float> @bitcast_v16bf16_to_v8f32_scalar(<16 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB47_3: ; VI-NEXT: s_branch .LBB47_2 @@ -16917,22 +16981,22 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, 
i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v27, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v25, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v23, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v21, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v7 ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v9 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v11 ; SI-NEXT: v_mul_f32_e32 v11, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v13 ; SI-NEXT: v_mul_f32_e32 v9, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v15 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -16944,89 +17008,105 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB66_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_alignbit_b32 v0, v0, v27, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v25, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v23, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v21, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v19, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, 
v17, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v19 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v17 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: 
s_cbranch_execz .LBB66_2 ; SI-NEXT: .LBB66_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 
v3, v3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -17039,12 +17119,12 @@ define <4 x i64> 
@bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB66_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 @@ -17055,15 +17135,15 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v6 +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v6 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 -; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 @@ -17073,15 +17153,15 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v8, 16 -; VI-NEXT: 
v_lshlrev_b32_e32 v8, 16, v5 +; VI-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v5 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 -; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 @@ -17091,15 +17171,15 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v8, 16 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; VI-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v4 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 -; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -17109,15 +17189,15 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v8, 16 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v3 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -17127,15 +17207,15 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v8, 16 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; VI-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -17145,15 +17225,15 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v8, 16 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; VI-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v8, 
0xffff0000, v1 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -17163,15 +17243,15 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v8, 16 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v0 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -17181,9 +17261,9 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB66_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -17644,90 +17724,106 @@ define inreg <4 x i64> 
@bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21 ; SI-NEXT: v_mul_f32_e64 v17, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23 ; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25 ; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27 ; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29 ; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 ; SI-NEXT: s_cbranch_scc0 .LBB67_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v2, 
16, v21 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v17 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v15 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v13 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v24 ; SI-NEXT: s_cbranch_execnz .LBB67_3 ; SI-NEXT: .LBB67_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 -; SI-NEXT: 
v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 ; SI-NEXT: 
v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: .LBB67_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB67_4: @@ -17742,7 +17838,7 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB67_4 ; VI-NEXT: .LBB67_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 ; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 @@ -17750,7 +17846,7 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s23, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -17758,17 +17854,17 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v7, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s22, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -17776,17 +17872,17 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v6, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s21, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -17794,17 +17890,17 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: 
v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v5, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s20, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -17812,17 +17908,17 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v4, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -17830,17 +17926,17 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 
v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v8, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s18, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v8, v2, 16, 1 @@ -17848,17 +17944,17 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; VI-NEXT: v_or_b32_e32 v9, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v8, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc ; VI-NEXT: v_add_f32_e32 v8, s4, v0 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 @@ -17866,15 +17962,15 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, 
v8, v8 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v8, v1, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v8, s4, v0 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s16, 16 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_add_f32_e32 v0, s4, v0 @@ -17884,9 +17980,9 @@ define inreg <4 x i64> @bitcast_v16bf16_to_v4i64_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB67_3: ; VI-NEXT: s_branch .LBB67_2 @@ -22955,22 +23051,22 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v27, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v25, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v23, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v21, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 
v20, 1.0, v7 ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v9 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v11 ; SI-NEXT: v_mul_f32_e32 v11, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v13 ; SI-NEXT: v_mul_f32_e32 v9, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v15 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -22982,89 +23078,105 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB82_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_alignbit_b32 v0, v0, v27, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v25, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v23, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v21, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v19, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v17, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 +; SI-NEXT: 
v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v19 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v17 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB82_2 ; SI-NEXT: .LBB82_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; 
SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: 
v_and_b32_e32 v6, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -23077,12 +23189,12 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB82_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: 
v_add_f32_e32 v7, 0x40c00000, v7 @@ -23093,15 +23205,15 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v6 +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v6 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 -; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 @@ -23111,15 +23223,15 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v8, 16 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; VI-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v5 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 -; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 @@ -23129,15 +23241,15 @@ define <4 x double> 
@bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v8, 16 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; VI-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v4 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 -; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -23147,15 +23259,15 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v8, 16 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v3 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -23165,15 +23277,15 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v9, 
vcc, s6, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v8, 16 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; VI-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -23183,15 +23295,15 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v8, 16 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; VI-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v1 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -23201,15 +23313,15 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 
vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v8, 16 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v0 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -23219,9 +23331,9 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB82_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -23682,90 +23794,106 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg % ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v23, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v21, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v19, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s21 ; SI-NEXT: 
v_mul_f32_e64 v17, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s23 ; SI-NEXT: v_mul_f32_e64 v15, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s25 ; SI-NEXT: v_mul_f32_e64 v13, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s27 ; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29 ; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 ; SI-NEXT: s_cbranch_scc0 .LBB83_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v8 -; SI-NEXT: v_alignbit_b32 v0, v0, v23, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v21, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v17, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v15, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v13, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v11, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v17 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v15 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v13 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 +; 
SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v24 ; SI-NEXT: s_cbranch_execnz .LBB83_3 ; SI-NEXT: .LBB83_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v3, 
0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 
+; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: .LBB83_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB83_4: @@ -23780,7 +23908,7 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg % ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB83_4 ; VI-NEXT: .LBB83_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 ; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 @@ -23788,7 +23916,7 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg % ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s23, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -23796,17 +23924,17 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg % ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v7, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s22, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, 
v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -23814,17 +23942,17 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg % ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v6, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s21, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -23832,17 +23960,17 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg % ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v5, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s20, 16 
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -23850,17 +23978,17 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg % ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v4, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -23868,17 +23996,17 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg % ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v8, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; 
VI-NEXT: s_lshl_b32 s4, s18, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v8, v2, 16, 1 @@ -23886,17 +24014,17 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg % ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; VI-NEXT: v_or_b32_e32 v9, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v8, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v1 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 ; VI-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc ; VI-NEXT: v_add_f32_e32 v8, s4, v0 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 @@ -23904,15 +24032,15 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg % ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v8, v1, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v8, s4, v0 ; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s16, 16 ; VI-NEXT: 
v_or_b32_e32 v10, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_add_f32_e32 v0, s4, v0 @@ -23922,9 +24050,9 @@ define inreg <4 x double> @bitcast_v16bf16_to_v4f64_scalar(<16 x bfloat> inreg % ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 ; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB83_3: ; VI-NEXT: s_branch .LBB83_2 @@ -28535,58 +28663,66 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB94_2 ; SI-NEXT: .LBB94_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v28 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v26 -; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v4, v2, v5 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_alignbit_b32 v8, v6, v2, 16 -; SI-NEXT: v_and_b32_e32 v6, 
0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v8, v2, v9 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_alignbit_b32 v12, v7, v2, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v12, v2, v13 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v14, v15, v2, 16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v15 +; SI-NEXT: v_or_b32_e32 v14, v2, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v10, v11, v2, 16 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v11 +; SI-NEXT: v_or_b32_e32 v10, v2, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; SI-NEXT: 
v_and_b32_e32 v3, 0xffff0000, v16 -; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v6, v2, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 ; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 @@ -28728,26 +28864,26 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc ; VI-NEXT: v_bfe_u32 v16, v7, 16, 1 ; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v7 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_add_u32_e32 v16, vcc, s6, v16 ; VI-NEXT: v_or_b32_e32 v17, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; VI-NEXT: v_cndmask_b32_e32 v7, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, 
v6 -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v15, 16 -; VI-NEXT: v_alignbit_b32 v6, v6, v14, 16 -; VI-NEXT: v_alignbit_b32 v5, v5, v13, 16 -; VI-NEXT: v_alignbit_b32 v4, v4, v12, 16 -; VI-NEXT: v_alignbit_b32 v3, v3, v11, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v10, 16 -; VI-NEXT: v_alignbit_b32 v1, v1, v9, 16 -; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_sdwa v7, v15, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v14, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v13, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v12, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB94_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -29257,58 +29393,66 @@ define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v22 ; SI-NEXT: s_cbranch_execnz .LBB95_3 ; SI-NEXT: .LBB95_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v31 -; 
SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v28 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v26 -; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v4, v2, v5 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_alignbit_b32 v8, v6, v2, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v8, v2, v9 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_alignbit_b32 v12, v7, v2, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v12, v2, v13 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v14, v15, 
v2, 16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v15 +; SI-NEXT: v_or_b32_e32 v14, v2, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v10, v11, v2, 16 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v11 +; SI-NEXT: v_or_b32_e32 v10, v2, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16 -; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v6, v2, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, 
v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 ; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 @@ -29356,83 +29500,82 @@ define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v10, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; VI-NEXT: v_add_f32_e32 v3, s4, v1 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: 
v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v8, v4, v5, vcc +; VI-NEXT: v_cndmask_b32_e32 v11, v4, v5, vcc ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: v_add_f32_e32 v4, s4, v1 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s20, 16 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_add_f32_e32 v4, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v9, v5, v6, vcc +; VI-NEXT: v_cndmask_b32_e32 v12, v5, v6, vcc ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: s_lshl_b32 s5, s22, 16 ; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_add_f32_e32 v5, s4, v1 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_add_f32_e32 v5, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v10, v6, v7, vcc +; VI-NEXT: v_add_f32_e32 v5, s5, v1 ; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_add_f32_e32 v6, s4, v1 -; VI-NEXT: v_bfe_u32 v7, v6, 16, 
1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: s_lshl_b32 s5, s22, 16 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v6 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; VI-NEXT: v_add_f32_e32 v6, s5, v1 -; VI-NEXT: v_cndmask_b32_e32 v11, v7, v11, vcc ; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 ; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v6, v7, v12, vcc -; VI-NEXT: v_add_f32_e32 v7, s5, v1 -; VI-NEXT: v_bfe_u32 v12, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v7 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v12, v13, vcc ; VI-NEXT: s_lshl_b32 s5, s23, 16 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v13, vcc ; VI-NEXT: v_add_f32_e32 v7, s5, v1 ; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 ; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 @@ -29449,44 +29592,45 @@ define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v6, v12, v6, 16 -; VI-NEXT: v_add_f32_e32 v12, s4, v1 -; VI-NEXT: v_alignbit_b32 v7, v13, v7, 16 -; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_or_b32_sdwa v6, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f32_e32 v5, s4, v1 +; VI-NEXT: v_or_b32_sdwa v7, v7, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: 
v_bfe_u32 v13, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v5 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc -; VI-NEXT: v_add_f32_e32 v13, s4, v1 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_add_f32_e32 v5, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v13, v13, v14, vcc +; VI-NEXT: v_bfe_u32 v14, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v5 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 ; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_add_f32_e32 v1, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc +; VI-NEXT: v_cndmask_b32_e32 v5, v14, v15, vcc ; VI-NEXT: v_bfe_u32 v14, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v1 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 ; VI-NEXT: v_or_b32_e32 v15, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v14, v15, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v5, v1, v13, 16 -; VI-NEXT: v_alignbit_b32 v4, v4, v11, 16 -; VI-NEXT: v_alignbit_b32 v3, v3, v10, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v9, 16 -; VI-NEXT: v_alignbit_b32 v1, v15, v8, 16 -; VI-NEXT: v_alignbit_b32 v0, v14, v0, 16 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, 
v13 +; VI-NEXT: v_or_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 +; VI-NEXT: v_or_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; VI-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB95_3: ; VI-NEXT: s_branch .LBB95_2 @@ -33634,26 +33778,26 @@ define <16 x half> @bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc ; VI-NEXT: v_bfe_u32 v16, v7, 16, 1 ; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v7 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_add_u32_e32 v16, vcc, s6, v16 ; VI-NEXT: v_or_b32_e32 v17, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; VI-NEXT: v_cndmask_b32_e32 v7, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v15, 16 -; VI-NEXT: v_alignbit_b32 v6, v6, v14, 16 -; VI-NEXT: v_alignbit_b32 v5, v5, v13, 16 -; VI-NEXT: v_alignbit_b32 v4, v4, v12, 16 -; VI-NEXT: v_alignbit_b32 v3, v3, v11, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v10, 16 -; VI-NEXT: v_alignbit_b32 v1, v1, v9, 16 -; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; 
VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_sdwa v7, v15, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v14, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v13, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v12, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB102_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -34277,83 +34421,82 @@ define inreg <16 x half> @bitcast_v16bf16_to_v16f16_scalar(<16 x bfloat> inreg % ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc +; 
VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v10, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; VI-NEXT: v_add_f32_e32 v3, s4, v1 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_add_f32_e32 v3, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v8, v4, v5, vcc +; VI-NEXT: v_cndmask_b32_e32 v11, v4, v5, vcc ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: v_add_f32_e32 v4, s4, v1 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s20, 16 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_add_f32_e32 v4, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v9, v5, v6, vcc +; VI-NEXT: v_cndmask_b32_e32 v12, v5, v6, vcc ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: s_lshl_b32 s5, s22, 16 ; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_add_f32_e32 v5, s4, v1 +; VI-NEXT: v_add_f32_e32 v5, s5, v1 ; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_add_f32_e32 v5, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v10, v6, v7, vcc -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_add_f32_e32 v6, s4, v1 -; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: s_lshl_b32 s5, s22, 16 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v6 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; VI-NEXT: v_add_f32_e32 v6, s5, v1 -; VI-NEXT: v_cndmask_b32_e32 v11, v7, v11, vcc ; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 ; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: s_and_b32 s5, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v6, v7, v12, vcc -; VI-NEXT: v_add_f32_e32 v7, s5, v1 -; VI-NEXT: v_bfe_u32 v12, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v7 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v12, v13, vcc ; VI-NEXT: s_lshl_b32 s5, s23, 16 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v13, vcc ; 
VI-NEXT: v_add_f32_e32 v7, s5, v1 ; VI-NEXT: v_bfe_u32 v13, v7, 16, 1 ; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v7 @@ -34370,44 +34513,45 @@ define inreg <16 x half> @bitcast_v16bf16_to_v16f16_scalar(<16 x bfloat> inreg % ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v6, v12, v6, 16 -; VI-NEXT: v_add_f32_e32 v12, s4, v1 -; VI-NEXT: v_alignbit_b32 v7, v13, v7, 16 -; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_or_b32_sdwa v6, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f32_e32 v5, s4, v1 +; VI-NEXT: v_or_b32_sdwa v7, v7, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_bfe_u32 v13, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v5 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc -; VI-NEXT: v_add_f32_e32 v13, s4, v1 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_add_f32_e32 v5, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v13, v13, v14, vcc +; VI-NEXT: v_bfe_u32 v14, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v5 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 ; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_add_f32_e32 v1, s4, v1 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc +; VI-NEXT: v_cndmask_b32_e32 v5, v14, v15, vcc ; 
VI-NEXT: v_bfe_u32 v14, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v1 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 ; VI-NEXT: v_or_b32_e32 v15, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v14, v15, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v5, v1, v13, 16 -; VI-NEXT: v_alignbit_b32 v4, v4, v11, 16 -; VI-NEXT: v_alignbit_b32 v3, v3, v10, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v9, 16 -; VI-NEXT: v_alignbit_b32 v1, v15, v8, 16 -; VI-NEXT: v_alignbit_b32 v0, v14, v0, 16 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 +; VI-NEXT: v_or_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 +; VI-NEXT: v_or_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; VI-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB103_3: ; VI-NEXT: s_branch .LBB103_2 @@ -37698,22 +37842,22 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v1 -; SI-NEXT: 
v_mul_f32_e32 v36, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v34, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v48, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v38, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v7 ; SI-NEXT: v_mul_f32_e32 v52, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v9 ; SI-NEXT: v_mul_f32_e32 v50, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v11 ; SI-NEXT: v_mul_f32_e32 v55, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v13 ; SI-NEXT: v_mul_f32_e32 v53, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v15 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -37756,22 +37900,30 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB108_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v39 -; SI-NEXT: v_alignbit_b32 v8, v5, v48, 16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v33 -; SI-NEXT: v_alignbit_b32 v16, v5, v52, 16 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v49 -; SI-NEXT: v_alignbit_b32 v0, v0, v36, 16 -; SI-NEXT: v_alignbit_b32 v4, v6, v34, 16 -; SI-NEXT: v_alignbit_b32 v12, v14, v38, 16 -; SI-NEXT: v_alignbit_b32 v20, v22, v50, 16 -; SI-NEXT: v_alignbit_b32 v24, v5, v55, 16 -; SI-NEXT: v_alignbit_b32 v28, v30, v53, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v48 +; SI-NEXT: v_and_b32_e32 v6, 
0xffff0000, v39 +; SI-NEXT: v_or_b32_e32 v8, v5, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v33 +; SI-NEXT: v_or_b32_e32 v12, v5, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v52 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51 +; SI-NEXT: v_or_b32_e32 v16, v5, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v50 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v35 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v36 +; SI-NEXT: v_or_b32_e32 v20, v5, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v55 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v54 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v34 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v32 +; SI-NEXT: v_or_b32_e32 v24, v5, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v53 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v49 +; SI-NEXT: v_or_b32_e32 v4, v1, v2 +; SI-NEXT: v_or_b32_e32 v28, v5, v6 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 @@ -37784,81 +37936,97 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 ; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 ; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v32 ; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v33 ; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v37 ; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 -; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v49 ; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v32 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v33 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v49 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v49 +; SI-NEXT: ; 
implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB108_2 ; SI-NEXT: .LBB108_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v55 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v24, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v31 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v51 -; SI-NEXT: v_alignbit_b32 v28, v30, v0, 16 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v52 -; SI-NEXT: 
v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v16, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v50 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v39 -; SI-NEXT: v_alignbit_b32 v20, v22, v0, 16 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v48 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v8, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v8, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; SI-NEXT: v_or_b32_e32 v12, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35 -; SI-NEXT: v_alignbit_b32 v12, v14, v0, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, 
v36 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v32 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v4, v1, v2 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v31 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 @@ -37885,10 +38053,10 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; VI-LABEL: bitcast_v16bf16_to_v32i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v5 -; VI-NEXT: v_mov_b32_e32 v32, v4 -; VI-NEXT: v_mov_b32_e32 v35, v3 -; VI-NEXT: v_mov_b32_e32 v34, v2 +; VI-NEXT: v_mov_b32_e32 v35, v5 +; VI-NEXT: v_mov_b32_e32 v34, v4 +; VI-NEXT: v_mov_b32_e32 v33, v3 +; VI-NEXT: v_mov_b32_e32 v32, v2 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr2 @@ -37923,39 +38091,39 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 ; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 ; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; 
VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v32 ; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 ; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 ; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[32:33] ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] ; VI-NEXT: .LBB108_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB108_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc ; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 ; VI-NEXT: s_movk_i32 s6, 0x7fff ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 @@ -37963,165 +38131,173 @@ 
define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_or_b32_e32 v3, v1, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; VI-NEXT: v_cndmask_b32_e32 v16, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v4, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v33 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v10, v5, v9, vcc +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v9, v5, 16, 1 +; 
VI-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v10 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; VI-NEXT: v_or_b32_e32 v5, v33, v4 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v32 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_bfe_u32 v9, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v4 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v24, v9, v11, vcc +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, s6, v11 +; VI-NEXT: v_or_b32_e32 v12, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v35 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, s6, v11 +; VI-NEXT: v_or_b32_e32 v12, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v15, v11, v12, vcc +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v35 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 +; VI-NEXT: v_add_u32_e32 v12, vcc, s6, v12 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v11 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v34 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 +; VI-NEXT: 
v_add_u32_e32 v12, vcc, s6, v12 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v34 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_cndmask_b32_e32 v18, v12, v13, vcc +; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 +; VI-NEXT: v_add_u32_e32 v12, vcc, s6, v12 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v15 +; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc +; VI-NEXT: v_or_b32_e32 v12, v35, v9 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v18 +; VI-NEXT: v_or_b32_e32 v11, v34, v9 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v13, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v9 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_add_u32_e32 v13, vcc, s6, v13 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v22, v13, v14, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v13, vcc +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v13, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v9 +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_add_u32_e32 v13, vcc, s6, v13 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_bfe_u32 v9, v6, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v23, v13, v14, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_or_b32_e32 v13, 
0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v9, v13, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v22 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16 ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v35 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v35, v3, v2, 16 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v34 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v34, v3, v2, 16 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; 
VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v33, v3, v2, 16 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v32 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v32, v3, v2, 16 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: 
v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v7, v3, v2, 16 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v6, v3, v2, 16 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v24 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: 
v_or_b32_e32 v14, v7, v9 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v23 +; VI-NEXT: v_or_b32_e32 v2, v0, v2 +; VI-NEXT: v_or_b32_e32 v4, v32, v4 +; VI-NEXT: v_or_b32_e32 v13, v6, v9 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v13 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[13:14] +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v11 +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v4 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v3 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[2:3] +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v22 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v15 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v16 ; VI-NEXT: .LBB108_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v1 -; VI-NEXT: v_mov_b32_e32 v8, v34 -; VI-NEXT: v_mov_b32_e32 v12, v35 -; VI-NEXT: v_mov_b32_e32 v16, v32 -; VI-NEXT: v_mov_b32_e32 v20, v33 +; VI-NEXT: v_mov_b32_e32 v8, v32 +; VI-NEXT: v_mov_b32_e32 v12, v33 +; VI-NEXT: v_mov_b32_e32 v16, v34 +; VI-NEXT: v_mov_b32_e32 v20, v35 ; VI-NEXT: v_mov_b32_e32 v24, v6 ; VI-NEXT: v_mov_b32_e32 v28, v7 ; VI-NEXT: v_mov_b32_e32 v1, v38 @@ -38871,40 +39047,48 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v35, 
1.0, s16 -; SI-NEXT: v_mul_f32_e64 v32, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v33, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v39, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s21 ; SI-NEXT: v_mul_f32_e64 v37, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s23 ; SI-NEXT: v_mul_f32_e64 v51, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s25 ; SI-NEXT: v_mul_f32_e64 v49, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s27 ; SI-NEXT: v_mul_f32_e64 v55, 1.0, s28 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v1 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s29 ; SI-NEXT: v_mul_f32_e32 v53, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v1 ; SI-NEXT: s_cbranch_scc0 .LBB109_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_alignbit_b32 v8, v5, v39, 16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v36 -; SI-NEXT: v_alignbit_b32 v16, v5, v51, 16 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v52 -; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 -; SI-NEXT: v_alignbit_b32 v4, v6, v33, 16 -; SI-NEXT: v_alignbit_b32 v12, v14, v37, 16 -; SI-NEXT: v_alignbit_b32 v20, v22, v49, 16 -; SI-NEXT: v_alignbit_b32 v24, v5, v55, 16 -; SI-NEXT: v_alignbit_b32 v28, v30, v53, 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v39 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 +; SI-NEXT: v_or_b32_e32 v8, v5, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v36 +; SI-NEXT: v_or_b32_e32 v12, v5, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v51 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v50 +; SI-NEXT: 
v_or_b32_e32 v16, v5, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v49 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v48 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 +; SI-NEXT: v_or_b32_e32 v20, v5, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v55 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v54 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v32 +; SI-NEXT: v_or_b32_e32 v24, v5, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v53 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v52 +; SI-NEXT: v_or_b32_e32 v4, v1, v2 +; SI-NEXT: v_or_b32_e32 v28, v5, v6 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 @@ -38917,64 +39101,80 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, ; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 ; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 ; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v32 ; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v36 ; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v48 ; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 -; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v52 ; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v32 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v36 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v48 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v52 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v52 ; SI-NEXT: s_cbranch_execnz .LBB109_3 ; SI-NEXT: .LBB109_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v55 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v24, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v24, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v31 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v50 -; SI-NEXT: v_alignbit_b32 v28, v30, v0, 16 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; SI-NEXT: v_or_b32_e32 v28, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v51 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v16, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v16, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v38 -; SI-NEXT: v_alignbit_b32 v20, v22, v0, 16 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 +; SI-NEXT: v_or_b32_e32 v20, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v8, v1, v0, 16 -; 
SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v8, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v37 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; SI-NEXT: v_alignbit_b32 v12, v14, v0, 16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; SI-NEXT: v_or_b32_e32 v12, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v32 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v4, v1, v2 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v31 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 @@ -39038,270 +39238,274 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, ; VI-NEXT: s_cmp_lg_u32 s24, 0 ; VI-NEXT: 
s_cbranch_scc0 .LBB109_3 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s14, s23, 24 -; VI-NEXT: s_lshr_b32 s15, s23, 16 +; VI-NEXT: s_lshr_b32 s58, s23, 24 +; VI-NEXT: s_lshr_b32 s57, s23, 16 ; VI-NEXT: s_lshr_b32 s25, s23, 8 -; VI-NEXT: s_lshr_b32 s24, s22, 16 -; VI-NEXT: s_lshr_b32 s26, s22, 8 -; VI-NEXT: s_lshr_b32 s27, s21, 24 -; VI-NEXT: s_lshr_b32 s28, s21, 16 -; VI-NEXT: s_lshr_b32 s40, s21, 8 -; VI-NEXT: s_lshr_b32 s29, s20, 16 -; VI-NEXT: s_lshr_b32 s41, s20, 8 -; VI-NEXT: s_lshr_b32 s42, s19, 24 +; VI-NEXT: s_lshr_b32 s59, s22, 16 +; VI-NEXT: s_lshr_b32 s29, s22, 8 +; VI-NEXT: s_lshr_b32 s47, s21, 24 +; VI-NEXT: s_lshr_b32 s46, s21, 16 +; VI-NEXT: s_lshr_b32 s24, s21, 8 +; VI-NEXT: s_lshr_b32 s56, s20, 16 +; VI-NEXT: s_lshr_b32 s28, s20, 8 +; VI-NEXT: s_lshr_b32 s44, s19, 24 ; VI-NEXT: s_lshr_b32 s43, s19, 16 -; VI-NEXT: s_lshr_b32 s45, s19, 8 -; VI-NEXT: s_lshr_b32 s44, s18, 16 -; VI-NEXT: s_lshr_b32 s46, s18, 8 -; VI-NEXT: s_lshr_b32 s47, s17, 24 -; VI-NEXT: s_lshr_b32 s56, s17, 16 -; VI-NEXT: s_lshr_b32 s58, s17, 8 -; VI-NEXT: s_lshr_b32 s57, s16, 16 -; VI-NEXT: s_lshr_b32 s59, s16, 8 +; VI-NEXT: s_lshr_b32 s15, s19, 8 +; VI-NEXT: s_lshr_b32 s45, s18, 16 +; VI-NEXT: s_lshr_b32 s27, s18, 8 +; VI-NEXT: s_lshr_b32 s41, s17, 24 +; VI-NEXT: s_lshr_b32 s40, s17, 16 +; VI-NEXT: s_lshr_b32 s14, s17, 8 +; VI-NEXT: s_lshr_b32 s42, s16, 16 +; VI-NEXT: s_lshr_b32 s26, s16, 8 ; VI-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 ; VI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 ; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 ; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 ; VI-NEXT: s_cbranch_execnz .LBB109_4 ; VI-NEXT: .LBB109_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_mov_b32_e32 v2, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v0, s4, v2 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: 
v_add_f32_e32 v0, s4, v1 +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc -; VI-NEXT: v_add_f32_e32 v1, s4, v2 -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_cndmask_b32_e32 v6, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v3, v35, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; VI-NEXT: v_add_f32_e32 v0, s4, v2 -; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; 
VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v0, v3, v0, 16 -; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_f32_e32 v0, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v36, v2, v4, vcc +; VI-NEXT: v_bfe_u32 v4, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v2 +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v1 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v9, v4, v3, 16 -; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_lshl_b32 s4, s19, 16 +; VI-NEXT: v_cndmask_b32_e32 v7, v5, v7, vcc +; VI-NEXT: v_add_f32_e32 v5, s4, v1 +; VI-NEXT: v_bfe_u32 v8, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v5 ; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, 
vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v2 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_or_b32_e32 v5, v34, v4 +; VI-NEXT: v_add_f32_e32 v4, s4, v1 +; VI-NEXT: v_bfe_u32 v8, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v4 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v8, v4, v3, 16 -; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_cndmask_b32_e32 v10, v8, v9, vcc +; VI-NEXT: v_add_f32_e32 v8, s4, v1 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v2 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v17, v4, v3, 16 -; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v11, 
vcc +; VI-NEXT: v_add_f32_e32 v9, s4, v1 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v12, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: s_lshl_b32 s4, s21, 16 +; VI-NEXT: v_cndmask_b32_e32 v15, v11, v12, vcc +; VI-NEXT: v_add_f32_e32 v11, s4, v1 +; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v15 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 ; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v2 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v16, v4, v3, 16 -; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_or_b32_e32 v12, v33, v9 +; VI-NEXT: v_add_f32_e32 v9, s4, v1 +; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v9 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: s_lshl_b32 s4, s20, 16 +; VI-NEXT: v_cndmask_b32_e32 v18, v11, v13, vcc +; VI-NEXT: v_add_f32_e32 v11, s4, v1 +; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 +; VI-NEXT: v_add_u32_e32 v13, vcc, 
0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v13, v14, vcc +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v18 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 ; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s4, v2 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v25, v4, v3, 16 -; VI-NEXT: v_add_f32_e32 v3, s4, v2 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v11, v16, v9 +; VI-NEXT: v_add_f32_e32 v9, s4, v1 +; VI-NEXT: v_bfe_u32 v13, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v9 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: s_lshl_b32 s4, s23, 16 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_add_f32_e32 v9, s4, v1 +; VI-NEXT: v_cndmask_b32_e32 v22, v13, v14, vcc +; VI-NEXT: v_bfe_u32 v13, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v9 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_add_f32_e32 v2, s4, v2 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 
v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v24, v2, v3, 16 -; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[24:25] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[16:17] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v25 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v25 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v24 -; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v17 -; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v17 -; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v9 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v0 +; VI-NEXT: v_cndmask_b32_e32 v9, v13, v14, vcc +; VI-NEXT: v_add_f32_e32 v13, s4, v1 +; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 +; VI-NEXT: s_lshl_b32 s4, s22, 16 +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_bfe_u32 v13, v1, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v23, v14, v17, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v1 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v13, v14, vcc +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v36 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: 
v_and_b32_e32 v4, 0xffff0000, v10 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_or_b32_e32 v14, v32, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 +; VI-NEXT: v_or_b32_e32 v2, v0, v2 +; VI-NEXT: v_or_b32_e32 v4, v8, v4 +; VI-NEXT: v_or_b32_e32 v13, v24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v13 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[13:14] +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v11 +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v4 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v3 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[2:3] +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v22 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v15 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v7 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 ; VI-NEXT: s_branch .LBB109_5 ; VI-NEXT: .LBB109_3: -; VI-NEXT: ; implicit-def: $sgpr59 -; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr42 ; VI-NEXT: ; implicit-def: $sgpr4 -; VI-NEXT: ; implicit-def: $sgpr58 -; VI-NEXT: ; implicit-def: $sgpr56 -; VI-NEXT: ; implicit-def: $sgpr47 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr44 -; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr41 +; VI-NEXT: ; implicit-def: $sgpr27 ; VI-NEXT: ; implicit-def: $sgpr45 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr15 ; 
VI-NEXT: ; implicit-def: $sgpr43 -; VI-NEXT: ; implicit-def: $sgpr42 -; VI-NEXT: ; implicit-def: $sgpr41 -; VI-NEXT: ; implicit-def: $sgpr29 -; VI-NEXT: ; implicit-def: $sgpr8 -; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr44 ; VI-NEXT: ; implicit-def: $sgpr28 -; VI-NEXT: ; implicit-def: $sgpr27 -; VI-NEXT: ; implicit-def: $sgpr26 +; VI-NEXT: ; implicit-def: $sgpr56 +; VI-NEXT: ; implicit-def: $sgpr8 ; VI-NEXT: ; implicit-def: $sgpr24 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr47 +; VI-NEXT: ; implicit-def: $sgpr29 +; VI-NEXT: ; implicit-def: $sgpr59 ; VI-NEXT: ; implicit-def: $sgpr10 ; VI-NEXT: ; implicit-def: $sgpr25 -; VI-NEXT: ; implicit-def: $sgpr15 -; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr57 +; VI-NEXT: ; implicit-def: $sgpr58 ; VI-NEXT: s_branch .LBB109_2 ; VI-NEXT: .LBB109_4: -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: v_mov_b32_e32 v8, s18 -; VI-NEXT: v_mov_b32_e32 v9, s19 -; VI-NEXT: v_mov_b32_e32 v16, s20 -; VI-NEXT: v_mov_b32_e32 v17, s21 ; VI-NEXT: v_mov_b32_e32 v24, s22 -; VI-NEXT: v_mov_b32_e32 v25, s23 -; VI-NEXT: v_mov_b32_e32 v35, s59 -; VI-NEXT: v_mov_b32_e32 v2, s57 -; VI-NEXT: v_mov_b32_e32 v5, s58 -; VI-NEXT: v_mov_b32_e32 v6, s56 -; VI-NEXT: v_mov_b32_e32 v7, s47 -; VI-NEXT: v_mov_b32_e32 v34, s46 -; VI-NEXT: v_mov_b32_e32 v10, s44 -; VI-NEXT: v_mov_b32_e32 v13, s45 +; VI-NEXT: v_mov_b32_e32 v26, s59 +; VI-NEXT: v_mov_b32_e32 v32, s23 +; VI-NEXT: v_mov_b32_e32 v31, s58 +; VI-NEXT: v_mov_b32_e32 v30, s57 +; VI-NEXT: v_mov_b32_e32 v16, s20 +; VI-NEXT: v_mov_b32_e32 v18, s56 +; VI-NEXT: v_mov_b32_e32 v33, s21 +; VI-NEXT: v_mov_b32_e32 v23, s47 +; VI-NEXT: v_mov_b32_e32 v22, s46 +; VI-NEXT: v_mov_b32_e32 v8, s18 +; VI-NEXT: v_mov_b32_e32 v10, s45 +; VI-NEXT: v_mov_b32_e32 v34, s19 +; VI-NEXT: v_mov_b32_e32 v15, s44 ; VI-NEXT: v_mov_b32_e32 v14, s43 -; VI-NEXT: v_mov_b32_e32 v15, s42 -; VI-NEXT: v_mov_b32_e32 v33, s41 -; VI-NEXT: 
v_mov_b32_e32 v18, s29 -; VI-NEXT: v_mov_b32_e32 v21, s40 -; VI-NEXT: v_mov_b32_e32 v22, s28 -; VI-NEXT: v_mov_b32_e32 v23, s27 -; VI-NEXT: v_mov_b32_e32 v32, s26 -; VI-NEXT: v_mov_b32_e32 v26, s24 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v2, s42 +; VI-NEXT: v_mov_b32_e32 v35, s17 +; VI-NEXT: v_mov_b32_e32 v7, s41 +; VI-NEXT: v_mov_b32_e32 v6, s40 +; VI-NEXT: v_mov_b32_e32 v25, s29 ; VI-NEXT: v_mov_b32_e32 v29, s25 -; VI-NEXT: v_mov_b32_e32 v30, s15 -; VI-NEXT: v_mov_b32_e32 v31, s14 +; VI-NEXT: v_mov_b32_e32 v17, s28 +; VI-NEXT: v_mov_b32_e32 v21, s24 +; VI-NEXT: v_mov_b32_e32 v9, s27 +; VI-NEXT: v_mov_b32_e32 v13, s15 +; VI-NEXT: v_mov_b32_e32 v1, s26 +; VI-NEXT: v_mov_b32_e32 v5, s14 ; VI-NEXT: v_mov_b32_e32 v27, s10 ; VI-NEXT: v_mov_b32_e32 v19, s8 ; VI-NEXT: v_mov_b32_e32 v11, s6 ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: .LBB109_5: ; %end -; VI-NEXT: v_mov_b32_e32 v4, v1 -; VI-NEXT: v_mov_b32_e32 v12, v9 -; VI-NEXT: v_mov_b32_e32 v20, v17 -; VI-NEXT: v_mov_b32_e32 v28, v25 -; VI-NEXT: v_mov_b32_e32 v1, v35 -; VI-NEXT: v_mov_b32_e32 v9, v34 -; VI-NEXT: v_mov_b32_e32 v17, v33 -; VI-NEXT: v_mov_b32_e32 v25, v32 +; VI-NEXT: v_mov_b32_e32 v4, v35 +; VI-NEXT: v_mov_b32_e32 v12, v34 +; VI-NEXT: v_mov_b32_e32 v20, v33 +; VI-NEXT: v_mov_b32_e32 v28, v32 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v16bf16_to_v32i8_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll index 48c9b8775a474..cc427eb9326a9 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll @@ -1210,8 +1210,8 @@ define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: 
s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1223,19 +1223,21 @@ define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB14_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: .LBB14_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1248,12 +1250,12 @@ define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) { ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB14_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -1263,9 +1265,9 @@ define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: 
v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB14_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -1386,20 +1388,22 @@ define inreg i32 @bitcast_v2bf16_to_i32_scalar(<2 x bfloat> inreg %a, i32 inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: .LBB15_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB15_4: @@ -1414,13 +1418,13 @@ define inreg i32 @bitcast_v2bf16_to_i32_scalar(<2 x bfloat> inreg %a, i32 inreg ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB15_4 ; VI-NEXT: .LBB15_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; VI-NEXT: 
v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s16, 16 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_add_f32_e32 v0, s4, v0 @@ -1430,9 +1434,9 @@ define inreg i32 @bitcast_v2bf16_to_i32_scalar(<2 x bfloat> inreg %a, i32 inreg ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB15_3: ; VI-NEXT: s_branch .LBB15_2 @@ -3454,8 +3458,8 @@ define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -3467,19 +3471,21 @@ define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB34_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB34_2 ; SI-NEXT: .LBB34_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_and_b32_e32 
v0, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -3492,12 +3498,12 @@ define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) { ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB34_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -3507,9 +3513,9 @@ define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB34_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -3630,20 +3636,22 @@ define inreg float @bitcast_v2bf16_to_f32_scalar(<2 x bfloat> inreg %a, i32 inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, 
s16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: .LBB35_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB35_4: @@ -3658,13 +3666,13 @@ define inreg float @bitcast_v2bf16_to_f32_scalar(<2 x bfloat> inreg %a, i32 inre ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB35_4 ; VI-NEXT: .LBB35_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s16, 16 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_add_f32_e32 v0, s4, v0 @@ -3674,9 +3682,9 @@ define inreg float @bitcast_v2bf16_to_f32_scalar(<2 x bfloat> inreg %a, i32 inre ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: 
v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB35_3: ; VI-NEXT: s_branch .LBB35_2 @@ -5356,12 +5364,14 @@ define <2 x i16> @bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB50_2 ; SI-NEXT: .LBB50_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -5390,8 +5400,8 @@ define <2 x i16> @bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB50_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -5522,12 +5532,14 @@ define inreg <2 x i16> @bitcast_v2bf16_to_v2i16_scalar(<2 x bfloat> inreg %a, i3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; SI-NEXT: .LBB51_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: @@ -5560,8 +5572,8 @@ define inreg <2 x i16> @bitcast_v2bf16_to_v2i16_scalar(<2 x bfloat> inreg %a, i3 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB51_3: ; VI-NEXT: s_branch .LBB51_2 @@ -7024,8 +7036,8 @@ define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB62_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -7196,8 +7208,8 @@ define inreg <2 x half> @bitcast_v2bf16_to_v2f16_scalar(<2 x bfloat> inreg %a, i ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB63_3: ; VI-NEXT: s_branch .LBB63_2 @@ -8411,8 +8423,8 @@ define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, 
i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -8424,19 +8436,21 @@ define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB72_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB72_2 ; SI-NEXT: .LBB72_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8449,12 +8463,12 @@ define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) { ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB72_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: 
v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -8464,9 +8478,9 @@ define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB72_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -8587,20 +8601,22 @@ define inreg <1 x i32> @bitcast_v2bf16_to_v1i32_scalar(<2 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 ; SI-NEXT: s_cbranch_scc0 .LBB73_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 ; SI-NEXT: s_cbranch_execnz .LBB73_3 ; SI-NEXT: .LBB73_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: .LBB73_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB73_4: @@ -8615,13 +8631,13 @@ define inreg <1 
x i32> @bitcast_v2bf16_to_v1i32_scalar(<2 x bfloat> inreg %a, i3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB73_4 ; VI-NEXT: .LBB73_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s16, 16 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_add_f32_e32 v0, s4, v0 @@ -8631,9 +8647,9 @@ define inreg <1 x i32> @bitcast_v2bf16_to_v1i32_scalar(<2 x bfloat> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB73_3: ; VI-NEXT: s_branch .LBB73_2 @@ -8913,8 +8929,8 @@ define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v1 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -8929,21 +8945,25 @@ define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB76_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v2, v5, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; 
SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB76_2 ; SI-NEXT: .LBB76_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v3 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -8972,27 +8992,28 @@ define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) { ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB76_2 ; VI-NEXT: .LBB76_4: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc ; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: 
v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v4 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_or_b32_e32 v1, v0, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v3 ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -9163,22 +9184,26 @@ define inreg <4 x i8> @bitcast_v2bf16_to_v4i8_scalar(<2 x bfloat> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s18, 0 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 ; SI-NEXT: s_cbranch_scc0 .LBB77_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v2, v5, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 ; SI-NEXT: s_cbranch_execnz .LBB77_3 ; SI-NEXT: .LBB77_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v3 ; SI-NEXT: .LBB77_3: ; 
%end @@ -9196,44 +9221,45 @@ define inreg <4 x i8> @bitcast_v2bf16_to_v4i8_scalar(<2 x bfloat> inreg %a, i32 ; VI-NEXT: s_cmp_lg_u32 s17, 0 ; VI-NEXT: s_cbranch_scc0 .LBB77_3 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s7, s16, 24 -; VI-NEXT: s_lshr_b32 s6, s16, 16 -; VI-NEXT: s_lshr_b32 s8, s16, 8 +; VI-NEXT: s_lshr_b32 s6, s16, 24 +; VI-NEXT: s_lshr_b32 s8, s16, 16 +; VI-NEXT: s_lshr_b32 s7, s16, 8 ; VI-NEXT: s_cbranch_execnz .LBB77_4 ; VI-NEXT: .LBB77_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 ; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s16, 16 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc ; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v4 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_or_b32_e32 v1, v0, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB77_3: +; VI-NEXT: ; implicit-def: $sgpr7 ; VI-NEXT: ; implicit-def: $sgpr8 ; VI-NEXT: ; implicit-def: $sgpr6 -; VI-NEXT: ; 
implicit-def: $sgpr7 ; VI-NEXT: s_branch .LBB77_2 ; VI-NEXT: .LBB77_4: -; VI-NEXT: v_mov_b32_e32 v1, s8 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v2bf16_to_v4i8_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll index 68312b89142c7..2db07ae9626a8 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll @@ -88,8 +88,8 @@ define <3 x half> @bitcast_v3bf16_to_v3f16(<3 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB0_2: ; %end @@ -309,8 +309,8 @@ define inreg <3 x half> @bitcast_v3bf16_to_v3f16_scalar(<3 x bfloat> inreg %a, i ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] @@ -691,16 +691,16 @@ define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) { ; SI-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB4_2 ; SI-NEXT: .LBB4_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] @@ -739,8 +739,8 @@ define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB4_2: ; %end @@ -907,16 +907,16 @@ define inreg <3 x i16> @bitcast_v3bf16_to_v3i16_scalar(<3 x bfloat> inreg %a, i3 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; SI-NEXT: s_cbranch_execnz .LBB5_3 ; SI-NEXT: .LBB5_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; 
SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: .LBB5_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] @@ -959,8 +959,8 @@ define inreg <3 x i16> @bitcast_v3bf16_to_v3i16_scalar(<3 x bfloat> inreg %a, i3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index 5aac06a7f3a2b..bc0c0158aff29 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -5095,37 +5095,37 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v46, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v44, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v42, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v40, 1.0, 
v6 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v7 ; SI-NEXT: v_mul_f32_e32 v54, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v9 ; SI-NEXT: v_mul_f32_e32 v52, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v11 ; SI-NEXT: v_mul_f32_e32 v50, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v13 ; SI-NEXT: v_mul_f32_e32 v48, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v38, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v17 ; SI-NEXT: v_mul_f32_e32 v36, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v19 ; SI-NEXT: v_mul_f32_e32 v34, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v21 ; SI-NEXT: v_mul_f32_e32 v32, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v23 ; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v25 ; SI-NEXT: v_mul_f32_e32 v21, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 @@ -5135,170 +5135,202 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB22_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v45 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v43 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v3, 
16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_alignbit_b32 v0, v0, v46, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v44, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v40, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v54, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v52, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v50, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v48, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v38, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v36, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v34, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v32, 16 -; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 -; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 -; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v18, 16 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v46 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v40 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v54 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v52 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v50 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v38 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v36 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v23 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v21 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; 
implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: .LBB22_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB22_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v54 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_or_b32_e32 v5, 
v5, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v50 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v48 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v34 -; SI-NEXT: v_add_f32_e32 
v11, 0x40c00000, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 
v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: .LBB22_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -5322,12 +5354,12 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB22_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 @@ -5338,15 +5370,15 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc -; 
VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v14 +; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 @@ -5356,15 +5388,15 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_or_b32_sdwa v14, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v13 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 @@ -5374,15 +5406,15 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc -; VI-NEXT: 
v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v12 +; VI-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v12 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 @@ -5392,15 +5424,15 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v11 +; VI-NEXT: v_or_b32_sdwa v12, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v11 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 @@ -5410,15 +5442,15 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc -; VI-NEXT: 
v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v11, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v10 +; VI-NEXT: v_or_b32_sdwa v11, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v10 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 @@ -5428,15 +5460,15 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v9 +; VI-NEXT: v_or_b32_sdwa v10, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v9 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 @@ -5446,15 +5478,15 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 
16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; VI-NEXT: v_or_b32_sdwa v9, v9, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v8 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -5464,15 +5496,15 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v8, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v7 +; VI-NEXT: v_or_b32_sdwa v8, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v7 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 @@ -5482,15 +5514,15 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v7, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v16, 
16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v6 +; VI-NEXT: v_or_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v6 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 @@ -5500,15 +5532,15 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v6, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; VI-NEXT: v_or_b32_sdwa v6, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v5 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 @@ -5518,15 +5550,15 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v5, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; 
VI-NEXT: v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v4 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -5536,15 +5568,15 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v4, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v3 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -5554,15 +5586,15 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v3, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_or_b32_sdwa v3, v3, v16 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v2 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -5572,15 +5604,15 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v1 +; VI-NEXT: v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v1 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -5590,15 +5622,15 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v0 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -5608,9 +5640,9 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -6468,171 +6500,208 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 
v19, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s17 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v17 ; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 ; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 ; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 ; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 ; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 -; 
SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 -; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 -; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 -; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v51 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v49 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v39 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v37 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v17 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; 
SI-NEXT: v_lshrrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v33 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v32 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v31 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v30 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v29 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v28 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v27 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v25 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v24 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v23 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v22 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v21 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v19 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v40 ; SI-NEXT: s_cbranch_execnz .LBB23_3 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; 
SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36 -; SI-NEXT: 
v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; 
SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 ; SI-NEXT: 
v_and_b32_e32 v15, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB23_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -6654,7 +6723,7 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB23_4 ; VI-NEXT: .LBB23_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s31, 16 +; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 ; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 @@ -6662,7 +6731,7 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s31, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -6670,7 +6739,7 @@ define inreg <16 x i32> 
@bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_lshl_b32 s4, s30, 16 +; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; VI-NEXT: v_add_f32_e32 v3, s4, v0 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 @@ -6678,27 +6747,27 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s30, 16 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: v_add_f32_e32 v4, s4, v0 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 +; VI-NEXT: v_or_b32_sdwa v15, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v14, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s29, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; 
VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -6706,17 +6775,17 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v13, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s28, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -6724,17 +6793,17 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v12, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s27, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; 
VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -6742,17 +6811,17 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v11, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s26, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -6760,17 +6829,17 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v10, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s25, 16 ; VI-NEXT: 
v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -6778,17 +6847,17 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v9, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s24, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -6796,17 +6865,17 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v8, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: 
s_lshl_b32 s4, s23, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -6814,17 +6883,17 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v7, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s22, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -6832,17 +6901,17 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v6, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, 
s21, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s21, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -6850,17 +6919,17 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v5, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s20, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -6868,17 +6937,17 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v4, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, 
v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -6886,17 +6955,17 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s18, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 @@ -6904,17 +6973,17 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 ; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 ; VI-NEXT: v_or_b32_e32 v17, 
0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc ; VI-NEXT: v_add_f32_e32 v16, s4, v0 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 @@ -6922,15 +6991,15 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v16, s4, v0 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s16, 16 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v0, s4, v0 @@ -6940,9 +7009,9 @@ define inreg <16 x i32> @bitcast_v32bf16_to_v16i32_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_branch .LBB23_5 ; VI-NEXT: .LBB23_3: ; VI-NEXT: s_branch .LBB23_2 @@ -19915,37 +19984,37 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) ; 
SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v46, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v44, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v42, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v40, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v7 ; SI-NEXT: v_mul_f32_e32 v54, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v9 ; SI-NEXT: v_mul_f32_e32 v52, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v11 ; SI-NEXT: v_mul_f32_e32 v50, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v13 ; SI-NEXT: v_mul_f32_e32 v48, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v38, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v17 ; SI-NEXT: v_mul_f32_e32 v36, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v19 ; SI-NEXT: v_mul_f32_e32 v34, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v21 ; SI-NEXT: v_mul_f32_e32 v32, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v23 ; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v25 ; SI-NEXT: v_mul_f32_e32 v21, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v30 ; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 @@ -19955,170 +20024,202 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v45 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v43 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_alignbit_b32 v0, v0, v46, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v44, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v40, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v54, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v52, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v50, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v48, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v38, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v36, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v34, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v32, 16 -; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 -; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 -; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v18, 16 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v46 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 +; SI-NEXT: 
v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v40 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v54 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v52 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v50 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v38 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v36 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v23 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v21 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: 
$vgpr41 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: .LBB46_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 
0xffff0000, v43 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v54 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v50 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v48 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: 
v_and_b32_e32 v9, 0xffff0000, v36 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v34 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 ; SI-NEXT: v_and_b32_e32 v13, 
0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: .LBB46_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -20142,12 +20243,12 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB46_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, 
v17 -; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 @@ -20158,15 +20259,15 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v14 +; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 @@ -20176,15 +20277,15 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_or_b32_sdwa v14, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v13 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 
-; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 @@ -20194,15 +20295,15 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v12 +; VI-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v12 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 @@ -20212,15 +20313,15 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v11 +; VI-NEXT: v_or_b32_sdwa v12, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v11 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; 
VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 @@ -20230,15 +20331,15 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v11, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v10 +; VI-NEXT: v_or_b32_sdwa v11, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v10 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 @@ -20248,15 +20349,15 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v9 +; VI-NEXT: v_or_b32_sdwa v10, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v9 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; 
VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 @@ -20266,15 +20367,15 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; VI-NEXT: v_or_b32_sdwa v9, v9, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v8 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -20284,15 +20385,15 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v8, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v7 +; VI-NEXT: v_or_b32_sdwa v8, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v7 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, 
v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 @@ -20302,15 +20403,15 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v7, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v6 +; VI-NEXT: v_or_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v6 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 @@ -20320,15 +20421,15 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v6, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; VI-NEXT: v_or_b32_sdwa v6, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v5 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 
16, v5 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 @@ -20338,15 +20439,15 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v5, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; VI-NEXT: v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v4 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -20356,15 +20457,15 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v4, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v3 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_or_b32_e32 v18, 
0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -20374,15 +20475,15 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v3, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_or_b32_sdwa v3, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v2 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -20392,15 +20493,15 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v1 +; VI-NEXT: v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v1 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -20410,15 +20511,15 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v0 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -20428,9 +20529,9 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB46_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -21288,171 +21389,208 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 -; SI-NEXT: 
v_mul_f32_e32 v32, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s17 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v17 ; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 ; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 ; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, 
s25 ; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 ; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 -; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 -; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 -; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: 
v_lshrrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v51 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v49 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v39 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v37 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v17 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v33 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v32 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v31 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v30 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v29 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v28 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v27 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v25 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v24 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v23 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v22 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v21 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v19 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v40 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; 
SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 ; 
SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: 
v_and_b32_e32 v8, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_add_f32_e32 
v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -21474,7 +21612,7 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB47_4 ; VI-NEXT: .LBB47_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s31, 16 +; VI-NEXT: s_and_b32 s4, s31, 
0xffff0000 ; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 @@ -21482,7 +21620,7 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s31, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -21490,7 +21628,7 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_lshl_b32 s4, s30, 16 +; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; VI-NEXT: v_add_f32_e32 v3, s4, v0 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 @@ -21498,27 +21636,27 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s30, 16 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: v_add_f32_e32 v4, s4, v0 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 +; VI-NEXT: v_or_b32_sdwa v15, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 
+; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v14, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s29, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -21526,17 +21664,17 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v13, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s28, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -21544,17 +21682,17 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, 
v2 -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v12, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s27, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -21562,17 +21700,17 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v11, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s26, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -21580,17 +21718,17 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; 
VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v10, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s25, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -21598,17 +21736,17 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v9, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s24, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -21616,17 +21754,17 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: 
v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v8, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s23, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -21634,17 +21772,17 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v7, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s22, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -21652,17 +21790,17 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 
v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v6, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s21, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -21670,17 +21808,17 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v5, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s20, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -21688,17 +21826,17 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, 
v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v4, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -21706,17 +21844,17 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s18, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 @@ -21724,17 +21862,17 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 ; VI-NEXT: v_or_b32_e32 v17, 
0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 ; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc ; VI-NEXT: v_add_f32_e32 v16, s4, v0 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 @@ -21742,15 +21880,15 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v16, s4, v0 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s16, 16 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v0, s4, v0 @@ -21760,9 +21898,9 @@ define inreg <16 x float> @bitcast_v32bf16_to_v16f32_scalar(<32 x bfloat> inreg ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, 
v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_branch .LBB47_5 ; VI-NEXT: .LBB47_3: ; VI-NEXT: s_branch .LBB47_2 @@ -34243,37 +34381,37 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v46, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v44, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v42, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v40, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v7 ; SI-NEXT: v_mul_f32_e32 v54, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v9 ; SI-NEXT: v_mul_f32_e32 v52, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v11 ; SI-NEXT: v_mul_f32_e32 v50, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v13 ; SI-NEXT: v_mul_f32_e32 v48, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v38, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v17 ; SI-NEXT: v_mul_f32_e32 v36, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v19 ; SI-NEXT: v_mul_f32_e32 v34, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v21 ; SI-NEXT: 
v_mul_f32_e32 v32, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v23 ; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v25 ; SI-NEXT: v_mul_f32_e32 v21, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 @@ -34283,170 +34421,202 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB66_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v45 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v43 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_alignbit_b32 v0, v0, v46, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v44, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v40, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v54, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v52, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v50, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v48, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v38, 16 -; SI-NEXT: 
v_alignbit_b32 v9, v9, v36, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v34, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v32, 16 -; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 -; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 -; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v18, 16 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v46 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v40 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v54 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v52 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v50 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v38 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v36 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v23 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_lshrrev_b32_e32 
v13, 16, v21 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; 
SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: .LBB66_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB66_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_add_f32_e32 v4, 
0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v54 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v50 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v48 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 
0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v34 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_or_b32_e32 
v11, v11, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: .LBB66_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: 
buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -34470,12 +34640,12 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB66_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 @@ -34486,15 +34656,15 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v14 +; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 @@ -34504,15 +34674,15 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: 
v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_or_b32_sdwa v14, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v13 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 @@ -34522,15 +34692,15 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v12 +; VI-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v12 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 @@ -34540,15 +34710,15 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: 
v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v11 +; VI-NEXT: v_or_b32_sdwa v12, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v11 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 @@ -34558,15 +34728,15 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v11, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v10 +; VI-NEXT: v_or_b32_sdwa v11, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v10 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 @@ -34576,15 +34746,15 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: 
v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v9 +; VI-NEXT: v_or_b32_sdwa v10, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v9 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 @@ -34594,15 +34764,15 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; VI-NEXT: v_or_b32_sdwa v9, v9, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v8 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -34612,15 +34782,15 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; 
VI-NEXT: v_cndmask_b32_e32 v8, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v7 +; VI-NEXT: v_or_b32_sdwa v8, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v7 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 @@ -34630,15 +34800,15 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v7, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v6 +; VI-NEXT: v_or_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v6 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 @@ -34648,15 +34818,15 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v6, v17, v18, vcc -; 
VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; VI-NEXT: v_or_b32_sdwa v6, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v5 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 @@ -34666,15 +34836,15 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v5, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; VI-NEXT: v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v4 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -34684,15 +34854,15 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v4, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; 
VI-NEXT: v_alignbit_b32 v4, v4, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v3 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -34702,15 +34872,15 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v3, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_or_b32_sdwa v3, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v2 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -34720,15 +34890,15 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v16, 16 -; 
VI-NEXT: v_lshlrev_b32_e32 v16, 16, v1 +; VI-NEXT: v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v1 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -34738,15 +34908,15 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v0 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -34756,9 +34926,9 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v16 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB66_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -35616,171 +35786,208 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s17 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v16 +; SI-NEXT: 
v_mul_f32_e32 v16, 1.0, v17 ; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 ; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 ; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 ; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 ; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB67_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 
-; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 -; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 -; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 -; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v51 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v49 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v39 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v37 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v17 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v33 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v32 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v31 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v30 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v29 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v28 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v27 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v25 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v24 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v23 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v22 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v21 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: 
v_lshrrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v19 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v40 ; SI-NEXT: s_cbranch_execnz .LBB67_3 ; SI-NEXT: .LBB67_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v4, 
0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, 
v32 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_add_f32_e32 v11, 
0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: .LBB67_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded 
Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB67_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -35802,7 +36009,7 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB67_4 ; VI-NEXT: .LBB67_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s31, 16 +; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 ; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 @@ -35810,7 +36017,7 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s31, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -35818,7 +36025,7 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_lshl_b32 s4, s30, 16 +; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; VI-NEXT: v_add_f32_e32 v3, s4, v0 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 @@ -35826,27 +36033,27 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s30, 16 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: v_add_f32_e32 v4, s4, v0 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_lshrrev_b32_e32 
v2, 16, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 +; VI-NEXT: v_or_b32_sdwa v15, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v14, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s29, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -35854,17 +36061,17 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v13, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: 
s_lshl_b32 s4, s28, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -35872,17 +36079,17 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v12, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s27, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -35890,17 +36097,17 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v11, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: 
s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s26, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -35908,17 +36115,17 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v10, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s25, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -35926,17 +36133,17 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v9, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s24, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -35944,17 +36151,17 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v8, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s23, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -35962,17 +36169,17 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v7, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: 
v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s22, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -35980,17 +36187,17 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v6, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s21, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -35998,17 +36205,17 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v5, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 
v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s20, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -36016,17 +36223,17 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v4, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -36034,17 +36241,17 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v16, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 
v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s18, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 @@ -36052,17 +36259,17 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 ; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 ; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc ; VI-NEXT: v_add_f32_e32 v16, s4, v0 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 @@ -36070,15 +36277,15 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v16, s4, v0 ; 
VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s16, 16 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v0, s4, v0 @@ -36088,9 +36295,9 @@ define inreg <8 x i64> @bitcast_v32bf16_to_v8i64_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_branch .LBB67_5 ; VI-NEXT: .LBB67_3: ; VI-NEXT: s_branch .LBB67_2 @@ -47646,37 +47853,37 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v46, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v44, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v42, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v40, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v7 ; SI-NEXT: v_mul_f32_e32 v54, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v9 ; SI-NEXT: v_mul_f32_e32 v52, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v11 ; SI-NEXT: v_mul_f32_e32 v50, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15 +; SI-NEXT: 
v_mul_f32_e32 v49, 1.0, v13 ; SI-NEXT: v_mul_f32_e32 v48, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v38, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v17 ; SI-NEXT: v_mul_f32_e32 v36, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v19 ; SI-NEXT: v_mul_f32_e32 v34, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v21 ; SI-NEXT: v_mul_f32_e32 v32, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v23 ; SI-NEXT: v_mul_f32_e32 v23, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v25 ; SI-NEXT: v_mul_f32_e32 v21, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 @@ -47686,170 +47893,202 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB82_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v45 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v43 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 -; SI-NEXT: 
v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16 -; SI-NEXT: v_alignbit_b32 v0, v0, v46, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v44, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v40, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v54, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v52, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v50, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v48, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v38, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v36, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v34, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v32, 16 -; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 -; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 -; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v18, 16 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v46 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v40 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v54 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v52 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v50 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v48 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v38 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_lshrrev_b32_e32 
v9, 16, v36 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v32 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v23 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v21 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; 
implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: .LBB82_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB82_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v45 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55 -; SI-NEXT: v_alignbit_b32 v2, 
v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v54 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v50 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, 
v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v48 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v34 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: 
v_or_b32_e32 v10, v10, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v32 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: 
v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: .LBB82_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -47873,12 +48112,12 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB82_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 @@ -47889,15 +48128,15 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v14 +; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 
v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 @@ -47907,15 +48146,15 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_or_b32_sdwa v14, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v13 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 @@ -47925,15 +48164,15 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v12 +; VI-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v12 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, 
vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 @@ -47943,15 +48182,15 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v11 +; VI-NEXT: v_or_b32_sdwa v12, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v11 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 @@ -47961,15 +48200,15 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v11, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v10 +; VI-NEXT: v_or_b32_sdwa v11, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v10 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, 
v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 @@ -47979,15 +48218,15 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v9 +; VI-NEXT: v_or_b32_sdwa v10, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v9 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 @@ -47997,15 +48236,15 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; VI-NEXT: v_or_b32_sdwa v9, v9, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v8 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: 
v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 @@ -48015,15 +48254,15 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v8, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v7 +; VI-NEXT: v_or_b32_sdwa v8, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v7 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 @@ -48033,15 +48272,15 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v7, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v6 +; VI-NEXT: v_or_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v6 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; 
VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 @@ -48051,15 +48290,15 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v6, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; VI-NEXT: v_or_b32_sdwa v6, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v5 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 @@ -48069,15 +48308,15 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v5, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; VI-NEXT: v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v4 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 
+; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -48087,15 +48326,15 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v4, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 +; VI-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v3 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 @@ -48105,15 +48344,15 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v3, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_or_b32_sdwa v3, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v2 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 
; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -48123,15 +48362,15 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v1 +; VI-NEXT: v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v1 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -48141,15 +48380,15 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v16, 16 -; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v0 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, 
v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -48159,9 +48398,9 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB82_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -49019,171 +49258,208 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s17 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, 
v3 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v17 ; SI-NEXT: v_mul_f32_e64 v55, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v53, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s21 ; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s23 ; SI-NEXT: v_mul_f32_e64 v49, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v48, 1.0, s25 ; SI-NEXT: v_mul_f32_e64 v39, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s27 ; SI-NEXT: v_mul_f32_e64 v37, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB83_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 
v14, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v55, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v53, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v51, 16 -; SI-NEXT: v_alignbit_b32 v4, v4, v49, 16 -; SI-NEXT: v_alignbit_b32 v5, v5, v39, 16 -; SI-NEXT: v_alignbit_b32 v6, v6, v37, 16 -; SI-NEXT: v_alignbit_b32 v7, v7, v33, 16 -; SI-NEXT: v_alignbit_b32 v8, v8, v31, 16 -; SI-NEXT: v_alignbit_b32 v9, v9, v29, 16 -; SI-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; SI-NEXT: v_alignbit_b32 v11, v11, v25, 16 -; SI-NEXT: v_alignbit_b32 v12, v12, v23, 16 -; SI-NEXT: v_alignbit_b32 v13, v13, v21, 16 -; SI-NEXT: v_alignbit_b32 v14, v14, v19, 16 -; SI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v51 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v49 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v39 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v37 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v17 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v33 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v32 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v31 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v30 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v29 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 
+; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v28 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v27 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v25 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v24 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v23 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v22 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v21 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v20 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v19 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v18 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v40 ; SI-NEXT: s_cbranch_execnz .LBB83_3 ; SI-NEXT: .LBB83_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v3, 
0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 -; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 -; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v36 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v37 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v6, 
0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v32 -; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v30 -; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 -; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v26 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v27 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v25 ; SI-NEXT: 
v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v24 -; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: 
v_and_b32_e32 v15, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: .LBB83_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB83_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -49205,7 +49481,7 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB83_4 ; VI-NEXT: .LBB83_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s31, 16 +; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 ; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 @@ -49213,7 +49489,7 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s31, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s31, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -49221,7 +49497,7 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: s_lshl_b32 s4, s30, 16 +; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; VI-NEXT: v_add_f32_e32 v3, s4, v0 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 @@ -49229,27 
+49505,27 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: s_and_b32 s4, s30, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s30, 16 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: v_add_f32_e32 v4, s4, v0 ; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_alignbit_b32 v14, v1, v3, 16 +; VI-NEXT: v_or_b32_sdwa v15, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v14, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s29, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -49257,17 +49533,17 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 
v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_alignbit_b32 v13, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v13, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s28, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s28, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -49275,17 +49551,17 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_alignbit_b32 v12, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v12, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s27, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s27, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -49293,17 +49569,17 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, 
v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_alignbit_b32 v11, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v11, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s26, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s26, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -49311,17 +49587,17 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_alignbit_b32 v10, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v10, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s25, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s25, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -49329,17 +49605,17 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_alignbit_b32 v9, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v9, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s24, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s24, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -49347,17 +49623,17 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_alignbit_b32 v8, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v8, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s23, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s23, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -49365,17 +49641,17 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: 
v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_alignbit_b32 v7, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v7, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s22, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s22, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -49383,17 +49659,17 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_alignbit_b32 v6, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v6, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s21, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s21, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -49401,17 +49677,17 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_alignbit_b32 v5, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v5, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s20, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s20, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -49419,17 +49695,17 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v4, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s19, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s19, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -49437,17 +49713,17 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: 
v_or_b32_e32 v16, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v16, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s18, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 @@ -49455,17 +49731,17 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 ; VI-NEXT: v_or_b32_e32 v17, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v16, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v1 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 ; VI-NEXT: v_or_b32_e32 v17, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc ; VI-NEXT: v_add_f32_e32 v16, s4, v0 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 @@ -49473,15 +49749,15 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; 
VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v16, v1, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v16, s4, v0 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s16, 16 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_add_f32_e32 v0, s4, v0 @@ -49491,9 +49767,9 @@ define inreg <8 x double> @bitcast_v32bf16_to_v8f64_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 ; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_branch .LBB83_5 ; VI-NEXT: .LBB83_3: ; VI-NEXT: s_branch .LBB83_2 @@ -60284,7 +60560,7 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0 ; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1 @@ -60316,7 
+60592,7 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_mul_f32_e32 v52, 1.0, v27 ; SI-NEXT: v_mul_f32_e32 v41, 1.0, v28 ; SI-NEXT: v_mul_f32_e32 v40, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v30 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -60351,7 +60627,7 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v54 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -60387,8 +60663,8 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v52 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v41 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v40 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v54 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr33 @@ -60419,116 +60695,132 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: .LBB94_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB94_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v3, 
0xffff0000, v60 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v58 -; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v4, v2, v5 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_alignbit_b32 v8, v6, v2, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v8, v2, v9 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_alignbit_b32 v12, v7, v2, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v12, v2, v13 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v47 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_alignbit_b32 v16, v9, v2, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v16, v2, v17 ; SI-NEXT: v_and_b32_e32 v2, 
0xffff0000, v45 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 -; SI-NEXT: v_alignbit_b32 v20, v10, v2, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v20, v2, v21 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v43 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_alignbit_b32 v24, v11, v2, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v24, v2, v25 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v41 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 -; SI-NEXT: v_alignbit_b32 v28, v13, v2, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v55 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v28, v2, v29 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v52 -; SI-NEXT: v_alignbit_b32 v30, v31, v2, 16 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v31 +; SI-NEXT: v_or_b32_e32 v30, 
v2, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v50 -; SI-NEXT: v_alignbit_b32 v26, v27, v2, 16 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v27 +; SI-NEXT: v_or_b32_e32 v26, v2, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v48 -; SI-NEXT: v_alignbit_b32 v22, v23, v2, 16 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v23 +; SI-NEXT: v_or_b32_e32 v22, v2, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v38 -; SI-NEXT: v_alignbit_b32 v18, v19, v2, 16 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v19 +; SI-NEXT: v_or_b32_e32 v18, v2, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v36 -; SI-NEXT: v_alignbit_b32 v14, v15, v2, 16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, 
v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v15 +; SI-NEXT: v_or_b32_e32 v14, v2, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v37 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v34 -; SI-NEXT: v_alignbit_b32 v10, v11, v2, 16 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v11 +; SI-NEXT: v_or_b32_e32 v10, v2, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v32 -; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v6, v2, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v32 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; 
SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 ; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 @@ -60824,38 +61116,38 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; VI-NEXT: v_cndmask_b32_e32 v15, v32, v33, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v15, v15, v31, 16 -; VI-NEXT: v_alignbit_b32 v14, v14, v30, 16 -; VI-NEXT: v_alignbit_b32 v13, v13, v29, 16 -; VI-NEXT: v_alignbit_b32 v12, v12, v28, 16 -; VI-NEXT: v_alignbit_b32 v11, v11, v27, 16 -; VI-NEXT: v_alignbit_b32 v10, v10, v26, 16 -; VI-NEXT: v_alignbit_b32 v9, v9, v25, 16 -; VI-NEXT: v_alignbit_b32 v8, v8, v24, 16 -; VI-NEXT: v_alignbit_b32 v7, v7, v23, 16 -; VI-NEXT: v_alignbit_b32 v6, v6, v22, 16 -; VI-NEXT: v_alignbit_b32 v5, v5, v21, 16 -; VI-NEXT: v_alignbit_b32 v4, v4, v20, 16 -; VI-NEXT: v_alignbit_b32 v3, v3, v19, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 -; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: 
v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_sdwa v15, v31, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v30, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v28, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v27, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v26, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v25, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v24, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v23, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v22, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v21, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v20, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; 
VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB94_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -61793,110 +62085,126 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v54 ; SI-NEXT: s_cbranch_execnz .LBB95_3 ; SI-NEXT: .LBB95_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v56 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v62 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v62 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v60 -; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v4, v2, v5 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_alignbit_b32 v8, v6, v2, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v58 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v8, v2, v9 
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_alignbit_b32 v12, v7, v2, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v12, v2, v13 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v47 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_alignbit_b32 v16, v9, v2, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v16, v2, v17 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v45 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 -; SI-NEXT: v_alignbit_b32 v20, v10, v2, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v20, v2, v21 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v43 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_alignbit_b32 v24, v11, v2, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v24, v2, v25 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v41 -; SI-NEXT: v_add_f32_e32 
v11, 0x40c00000, v11 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 -; SI-NEXT: v_alignbit_b32 v28, v13, v2, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v28, v2, v29 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v52 -; SI-NEXT: v_alignbit_b32 v30, v31, v2, 16 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v31 +; SI-NEXT: v_or_b32_e32 v30, v2, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v50 -; SI-NEXT: v_alignbit_b32 v26, v27, v2, 16 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v27 +; SI-NEXT: v_or_b32_e32 v26, v2, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v38 -; SI-NEXT: v_alignbit_b32 v22, v23, v2, 16 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v23 +; SI-NEXT: v_or_b32_e32 
v22, v2, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v34 -; SI-NEXT: v_alignbit_b32 v18, v19, v2, 16 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v19 +; SI-NEXT: v_or_b32_e32 v18, v2, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v48 -; SI-NEXT: v_alignbit_b32 v14, v15, v2, 16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v15 +; SI-NEXT: v_or_b32_e32 v14, v2, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 -; SI-NEXT: v_alignbit_b32 v10, v11, v2, 16 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v11 +; SI-NEXT: v_or_b32_e32 v10, v2, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v37 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v32 -; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v3 +; 
SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v6, v2, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v32 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 ; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 @@ -62012,19 +62320,19 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_add_f32_e32 v5, s5, v1 ; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 ; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_alignbit_b32 v14, v3, v2, 16 +; VI-NEXT: v_or_b32_sdwa v14, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v2, s4, v1 ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 
1 ; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_alignbit_b32 v15, v5, v4, 16 +; VI-NEXT: v_or_b32_sdwa v15, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 @@ -62037,8 +62345,8 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v13, v3, v2, 16 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_sdwa v13, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v2, s4, v1 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 @@ -62055,8 +62363,8 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v12, v3, v2, 16 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_sdwa v12, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v2, s4, v1 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 @@ -62073,8 +62381,8 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v11, v3, v2, 16 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: 
v_or_b32_sdwa v11, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v2, s4, v1 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 @@ -62091,8 +62399,8 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_sdwa v10, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v2, s4, v1 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 @@ -62109,8 +62417,8 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v9, v3, v2, 16 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_sdwa v9, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v2, s4, v1 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 @@ -62127,8 +62435,8 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v8, v3, v2, 16 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_sdwa v8, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v2, s4, v1 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 @@ -62145,8 +62453,8 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x 
bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v7, v3, v2, 16 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_sdwa v7, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v2, s4, v1 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 @@ -62163,8 +62471,8 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v6, v3, v2, 16 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_sdwa v6, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v2, s4, v1 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 @@ -62181,8 +62489,8 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v5, v3, v2, 16 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v2, s4, v1 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 @@ -62199,8 +62507,8 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v16, vcc ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v4, v3, v2, 16 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_sdwa v4, v2, 
v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v2, s4, v1 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 @@ -62217,8 +62525,8 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v2, 16 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v2, s4, v1 ; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 @@ -62235,8 +62543,8 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v2, v16, v2, 16 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; VI-NEXT: v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v16, s4, v1 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 @@ -62260,10 +62568,10 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16 -; VI-NEXT: v_alignbit_b32 v0, v16, v0, 16 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_branch .LBB95_5 ; VI-NEXT: .LBB95_3: ; VI-NEXT: s_branch .LBB95_2 @@ -73035,38 +73343,38 @@ define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; VI-NEXT: v_cndmask_b32_e32 v15, v32, v33, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v15, v15, v31, 16 -; VI-NEXT: v_alignbit_b32 v14, v14, v30, 16 -; VI-NEXT: v_alignbit_b32 v13, v13, v29, 16 -; VI-NEXT: v_alignbit_b32 v12, v12, v28, 16 -; VI-NEXT: v_alignbit_b32 v11, v11, v27, 16 -; VI-NEXT: v_alignbit_b32 v10, v10, v26, 16 -; VI-NEXT: v_alignbit_b32 v9, v9, v25, 16 -; VI-NEXT: v_alignbit_b32 v8, v8, v24, 16 -; VI-NEXT: v_alignbit_b32 v7, v7, v23, 16 -; VI-NEXT: v_alignbit_b32 v6, v6, v22, 16 -; VI-NEXT: v_alignbit_b32 v5, v5, v21, 16 -; VI-NEXT: v_alignbit_b32 v4, v4, v20, 16 -; VI-NEXT: v_alignbit_b32 v3, v3, v19, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 -; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16 -; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_and_b32_e32 v10, 
0xffff0000, v10 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_sdwa v15, v31, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v30, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v28, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v27, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v26, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v25, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v24, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v23, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v22, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v21, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v20, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB102_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -74251,19 +74559,19 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_add_f32_e32 v5, s5, v1 ; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 ; VI-NEXT: s_lshl_b32 s4, s29, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_alignbit_b32 v14, v3, v2, 16 +; VI-NEXT: v_or_b32_sdwa v14, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v2, s4, v1 ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_alignbit_b32 v15, v5, v4, 16 +; VI-NEXT: v_or_b32_sdwa v15, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: s_and_b32 s4, s29, 0xffff0000 @@ -74276,8 +74584,8 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s28, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v13, v3, v2, 16 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_sdwa v13, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v2, s4, v1 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 @@ 
-74294,8 +74602,8 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s27, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v12, v3, v2, 16 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_sdwa v12, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v2, s4, v1 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 @@ -74312,8 +74620,8 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s26, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v11, v3, v2, 16 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_sdwa v11, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v2, s4, v1 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 @@ -74330,8 +74638,8 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s25, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_sdwa v10, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v2, s4, v1 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 @@ -74348,8 +74656,8 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s24, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v9, 
v3, v2, 16 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_sdwa v9, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v2, s4, v1 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 @@ -74366,8 +74674,8 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s23, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v8, v3, v2, 16 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_sdwa v8, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v2, s4, v1 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 @@ -74384,8 +74692,8 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s22, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v7, v3, v2, 16 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_sdwa v7, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v2, s4, v1 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 @@ -74402,8 +74710,8 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s21, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v6, v3, v2, 16 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_sdwa v6, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v2, s4, v1 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 @@ -74420,8 +74728,8 @@ 
define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: s_lshl_b32 s4, s20, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v5, v3, v2, 16 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v2, s4, v1 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 @@ -74438,8 +74746,8 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v16, vcc ; VI-NEXT: s_lshl_b32 s4, s19, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v4, v3, v2, 16 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_sdwa v4, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v2, s4, v1 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 @@ -74456,8 +74764,8 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc ; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v2, 16 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v2, s4, v1 ; VI-NEXT: v_bfe_u32 v16, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v2 @@ -74474,8 +74782,8 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v2, v16, 
v2, 16 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; VI-NEXT: v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v16, s4, v1 ; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 @@ -74499,10 +74807,10 @@ define inreg <32 x half> @bitcast_v32bf16_to_v32f16_scalar(<32 x bfloat> inreg % ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16 -; VI-NEXT: v_alignbit_b32 v0, v16, v0, 16 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_branch .LBB103_5 ; VI-NEXT: .LBB103_3: ; VI-NEXT: s_branch .LBB103_2 @@ -83728,20 +84036,10 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: 
buffer_load_dword v43, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -83789,604 +84087,593 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v30 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v4 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: v_mul_f32_e32 v35, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v7 ; SI-NEXT: v_mul_f32_e32 v32, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v11 ; SI-NEXT: v_mul_f32_e32 v33, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v34, 1.0, v16 -; 
SI-NEXT: v_mul_f32_e32 v40, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v22 ; SI-NEXT: v_mul_f32_e32 v22, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v30 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; 
implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v37 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v50 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB108_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v31 -; SI-NEXT: v_alignbit_b32 v48, v1, v38, 16 -; SI-NEXT: v_alignbit_b32 v50, v37, v35, 16 -; SI-NEXT: v_alignbit_b32 v1, v50, v48, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v37 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v36 +; 
SI-NEXT: v_or_b32_e32 v16, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v35 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v31 +; SI-NEXT: v_or_b32_e32 v19, v1, v2 +; SI-NEXT: v_alignbit_b32 v1, v19, v16, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v50, v48, 16 +; SI-NEXT: v_alignbit_b32 v1, v19, v16, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v50, v48, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v32 -; SI-NEXT: v_alignbit_b32 v23, v1, v52, 16 -; SI-NEXT: v_alignbit_b32 v21, v19, v49, 16 -; SI-NEXT: v_alignbit_b32 v1, v21, v23, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v21, v23, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v21, v23, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v19, v16, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v33 -; SI-NEXT: v_alignbit_b32 v17, v1, v55, 16 -; SI-NEXT: v_alignbit_b32 v18, v16, v53, 16 -; SI-NEXT: v_alignbit_b32 v1, v18, v17, 24 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v48 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v39 +; SI-NEXT: v_or_b32_e32 v13, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v38 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v32 +; SI-NEXT: v_or_b32_e32 v14, v1, v2 +; SI-NEXT: v_alignbit_b32 v1, v14, v13, 24 ; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v18, v17, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v18, v17, 8 +; SI-NEXT: v_alignbit_b32 v1, v14, v13, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v34 -; SI-NEXT: v_alignbit_b32 v14, v1, v42, 16 -; SI-NEXT: v_alignbit_b32 v15, v13, v40, 16 -; SI-NEXT: v_alignbit_b32 v1, v15, v14, 24 +; SI-NEXT: v_alignbit_b32 v1, v14, v13, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v51 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 +; SI-NEXT: v_or_b32_e32 v11, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v33 +; SI-NEXT: v_or_b32_e32 v12, v1, v2 +; SI-NEXT: v_alignbit_b32 v1, v12, v11, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v15, v14, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v12, v11, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v15, v14, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v12, v11, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v39 -; SI-NEXT: v_alignbit_b32 v11, v1, v45, 16 -; SI-NEXT: v_alignbit_b32 v12, v10, 
v43, 16 -; SI-NEXT: v_alignbit_b32 v1, v12, v11, 24 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 +; SI-NEXT: v_or_b32_e32 v9, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v52 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v34 +; SI-NEXT: v_or_b32_e32 v10, v1, v2 +; SI-NEXT: v_alignbit_b32 v1, v10, v9, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v12, v11, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v10, v9, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v12, v11, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v10, v9, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v46 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v22 -; SI-NEXT: v_alignbit_b32 v8, v1, v47, 16 -; SI-NEXT: v_alignbit_b32 v9, v7, v24, 16 -; SI-NEXT: v_alignbit_b32 v1, v9, v8, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v40 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55 +; SI-NEXT: v_or_b32_e32 v7, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v18 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 +; SI-NEXT: v_or_b32_e32 v8, v1, v2 +; SI-NEXT: v_alignbit_b32 v1, v8, v7, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v9, v8, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v8, v7, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v9, v8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v8, v7, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v25 -; SI-NEXT: v_alignbit_b32 v5, v1, v56, 16 -; SI-NEXT: v_alignbit_b32 v6, v4, v28, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v42 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v41 +; SI-NEXT: v_or_b32_e32 v5, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v22 +; SI-NEXT: v_or_b32_e32 v6, v1, v2 ; SI-NEXT: v_alignbit_b32 v1, v6, v5, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v6, v5, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v6, v5, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 -; SI-NEXT: v_alignbit_b32 v2, v1, v57, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v22 -; SI-NEXT: v_alignbit_b32 v3, v1, v29, 16 -; SI-NEXT: v_lshrrev_b32_e32 v24, 8, v18 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v30 -; SI-NEXT: v_alignbit_b32 v20, v3, v2, 24 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v34 -; SI-NEXT: 
buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v9 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v20, v3, v2, 16 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 8, v15 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v6 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v20, v3, v2, 8 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v24 +; SI-NEXT: v_or_b32_e32 v3, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 +; SI-NEXT: v_or_b32_e32 v4, v1, v2 +; SI-NEXT: v_alignbit_b32 v1, v4, v3, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v39 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v4, v3, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v3 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v4, v3, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 24, v31 -; SI-NEXT: 
v_lshrrev_b32_e32 v63, 8, v50 -; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v32 -; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v21 -; SI-NEXT: v_lshrrev_b32_e32 v60, 24, v33 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v25 -; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v12 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v29 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v46, 24, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v2, v2, v15 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v22 +; SI-NEXT: v_alignbit_b32 v15, v2, v1, 24 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v20 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v15, v2, v1, 16 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v20 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v15, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v26 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 8, v10 +; SI-NEXT: buffer_store_dword v18, 
off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v19 +; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v12 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v31 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v32 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v33 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v34 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v43, 24, v22 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v62, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v45, 8, v2 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: 
; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: .LBB108_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB108_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v51 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_alignbit_b32 v23, v20, v19, 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v28 +; SI-NEXT: v_and_b32_e32 
v3, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v19 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v32 -; SI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v19 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29 -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v36 -; SI-NEXT: v_alignbit_b32 v21, v19, v20, 16 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v38 -; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_alignbit_b32 v48, v30, v20, 16 -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v31 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v54 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v55 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_alignbit_b32 v50, v37, v20, 16 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_alignbit_b32 v20, v50, v48, 24 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v41 -; SI-NEXT: v_alignbit_b32 v17, v17, v16, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v53 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v20, v50, v48, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v26 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v24 
+; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v20 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v42 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v33 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v21 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v21 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v40 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v18 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v17 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v20, v50, v48, 8 -; SI-NEXT: v_alignbit_b32 v5, v5, v4, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, 
v28 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v17 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v18 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: 
v_add_f32_e32 v28, 0x40c00000, v16 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v20, v21, v23, 24 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v44 -; SI-NEXT: v_alignbit_b32 v14, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v40 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v28 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v20, v21, v23, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v45 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v34 -; SI-NEXT: v_alignbit_b32 v18, v16, v18, 16 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_or_b32_e32 v16, v15, v16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v24 +; SI-NEXT: v_or_b32_e32 v19, v15, v19 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v23 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v20, v21, v23, 8 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v46 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v13 -; SI-NEXT: buffer_store_dword 
v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v18 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v20, v18, v17, 24 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v47 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: v_alignbit_b32 v11, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v43 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v20, v18, v17, 16 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v39 -; SI-NEXT: v_alignbit_b32 v15, v13, v15, 16 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v20, v18, v17, 8 -; SI-NEXT: v_alignbit_b32 v8, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v10 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v20, v15, v14, 24 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v24 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v21 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v20, v15, v14, 16 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v22 -; SI-NEXT: v_alignbit_b32 v12, v10, v12, 16 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 
4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v20 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v20, v15, v14, 8 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v7 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v26 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v20, v12, v11, 24 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v22 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v15, v19, v16, 24 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v20, v12, v11, 16 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v25 -; SI-NEXT: v_alignbit_b32 v9, v7, v9, 16 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v15, v19, v16, 16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v20, v12, v11, 8 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v4 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v15, v19, v16, 8 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v20, v9, v8, 24 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v25 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v15, v14, v13, 24 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v20, v9, v8, 16 -; SI-NEXT: 
v_alignbit_b32 v6, v4, v6, 16 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v15, v14, v13, 16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v20, v9, v8, 8 -; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v1 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v15, v14, v13, 8 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v20, v6, v5, 24 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v15, v12, v11, 24 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v20, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v3, v1, v3, 16 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v15, v12, v11, 16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v20, v6, v5, 8 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v15, v12, v11, 8 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v20, v3, v2, 24 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v15, v10, v9, 24 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v20, v3, v2, 16 -; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v22 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 
offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v15, v10, v9, 16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v20, v3, v2, 8 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v15, v10, v9, 8 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v27 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v15, v8, v7, 24 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v18 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v15, v8, v7, 16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v9 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v15, v8, v7, 8 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v15 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v15, v6, v5, 24 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v6 -; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v50 -; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v21 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 24, v30 -; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v29 -; SI-NEXT: v_lshrrev_b32_e32 v60, 24, 
v28 -; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v26 -; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v24 -; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v25 -; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v12 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v3 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v15, v6, v5, 16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v15, v6, v5, 8 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v15, v4, v3, 24 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v15, v4, v3, 16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v15, v4, v3, 8 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v15, v2, v1, 24 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v15, v2, v1, 16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v15, v2, v1, 8 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v46, 24, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, 
v20 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v19 +; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v12 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v24 +; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v23 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v22 +; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v18 +; SI-NEXT: v_lshrrev_b32_e32 v43, 24, v21 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v26 +; SI-NEXT: v_lshrrev_b32_e32 v62, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v45, 8, v2 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: .LBB108_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v48 -; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v20 -; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v15 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, 
v7 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v24 -; SI-NEXT: v_or_b32_e32 v22, v22, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_or_b32_e32 v22, v22, v24 -; SI-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v63 -; SI-NEXT: v_or_b32_e32 v22, v22, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xff, v37 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_or_b32_e32 v20, v20, v24 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_add_i32_e32 v22, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xff, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; SI-NEXT: v_or_b32_e32 v20, v20, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: 
v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_or_b32_e32 v20, v20, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v59 -; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v62 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; SI-NEXT: v_or_b32_e32 v17, v17, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v20 +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_or_b32_e32 v17, v17, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: 
buffer_store_dword v16, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v17, 0xff, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; SI-NEXT: v_or_b32_e32 v17, v17, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v60 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_add_i32_e32 v17, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v16, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v60 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_add_i32_e32 v16, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; SI-NEXT: v_or_b32_e32 v14, v14, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: 
v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_or_b32_e32 v14, v14, v16 -; SI-NEXT: v_add_i32_e32 v16, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; SI-NEXT: v_or_b32_e32 v14, v14, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v56 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v63 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v15 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 
offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 @@ -84395,58 +84682,91 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 32, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v11, 0xff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v61 +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v44 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v61 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], 
s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; SI-NEXT: v_or_b32_e32 v8, v8, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_or_b32_e32 v8, v8, v10 -; SI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v8, v10, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 44, v0 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v62 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v46 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt 
vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 @@ -84455,52 +84775,89 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v59 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v58 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v43 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_add_i32_e32 v5, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], 
s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v57 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: 
buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload @@ -84526,6 +84883,10 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: ; kill: killed $vgpr17 ; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; kill: killed $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; kill: killed $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr17 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -84542,49 +84903,47 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr29 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: ; implicit-def: $vgpr22 -; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr44 ; VI-NEXT: ; implicit-def: $vgpr63 ; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr60 -; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: ; 
implicit-def: $vgpr56 -; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr50 ; VI-NEXT: ; implicit-def: $vgpr46 ; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr38 ; VI-NEXT: ; implicit-def: $vgpr52 ; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr37 -; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: ; implicit-def: $vgpr33 ; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr30 ; VI-NEXT: ; kill: killed $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 ; VI-NEXT: ; implicit-def: $vgpr26 -; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr19 ; VI-NEXT: ; implicit-def: $vgpr18 @@ -84593,71 +84952,72 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB108_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v16 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v15 ; VI-NEXT: 
buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v14 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v13 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] ; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] -; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[7:8] -; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v16 ; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] -; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] -; VI-NEXT: v_mov_b32_e32 v26, v22 -; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[7:8] +; VI-NEXT: v_lshrrev_b32_e32 v28, 24, v16 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 ; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v14 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v13 -; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v12 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v12 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v11 -; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v10 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v55, 24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 
v56, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 -; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v40, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v58, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v62, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v1 +; VI-NEXT: v_lshrrev_b64 v[26:27], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] 
; VI-NEXT: .LBB108_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB108_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 ; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; VI-NEXT: v_cndmask_b32_e32 v36, v18, v19, vcc ; VI-NEXT: v_bfe_u32 v18, v2, 16, 1 ; VI-NEXT: s_movk_i32 s6, 0x7fff ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v2 @@ -84665,443 +85025,465 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v18, v19, vcc +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v36 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v17, 16 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; VI-NEXT: v_or_b32_e32 v29, v2, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 ; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 ; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; VI-NEXT: v_cndmask_b32_e32 v37, v18, v19, vcc ; VI-NEXT: v_bfe_u32 v18, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1 ; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 
; VI-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v37 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; VI-NEXT: v_or_b32_e32 v28, v1, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 ; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 ; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 -; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; VI-NEXT: v_cndmask_b32_e32 v34, v18, v19, vcc ; VI-NEXT: v_bfe_u32 v18, v4, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v4 ; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v4 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; VI-NEXT: v_cndmask_b32_e32 v4, v18, v19, vcc +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v34 ; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: v_alignbit_b32 v4, v4, v17, 16 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; VI-NEXT: v_or_b32_e32 v31, v4, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 ; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 ; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; VI-NEXT: v_cndmask_b32_e32 v35, v18, v19, vcc ; VI-NEXT: v_bfe_u32 v18, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v3 ; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: 
v_cndmask_b32_e32 v3, v18, v19, vcc +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v35 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_alignbit_b32 v3, v3, v17, 16 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; VI-NEXT: v_or_b32_e32 v30, v3, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 ; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 ; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 -; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; VI-NEXT: v_cndmask_b32_e32 v59, v18, v19, vcc ; VI-NEXT: v_bfe_u32 v18, v6, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v6 ; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v6 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; VI-NEXT: v_cndmask_b32_e32 v6, v18, v19, vcc +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v59 ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_alignbit_b32 v6, v6, v17, 16 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; VI-NEXT: v_or_b32_e32 v26, v6, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 ; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 ; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 -; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; VI-NEXT: v_cndmask_b32_e32 v61, v18, v19, vcc ; VI-NEXT: v_bfe_u32 v18, v5, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v5 ; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v5 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; VI-NEXT: v_cndmask_b32_e32 v5, 
v18, v19, vcc +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v61 ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v5, v5, v17, 16 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v8 +; VI-NEXT: v_or_b32_e32 v25, v5, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 ; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 ; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 -; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; VI-NEXT: v_cndmask_b32_e32 v46, v18, v19, vcc ; VI-NEXT: v_bfe_u32 v18, v8, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v8 ; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v8 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 ; VI-NEXT: v_cndmask_b32_e32 v8, v18, v19, vcc +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v46 ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: v_alignbit_b32 v8, v8, v17, 16 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; VI-NEXT: v_or_b32_e32 v23, v8, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 ; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 ; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 -; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; VI-NEXT: v_cndmask_b32_e32 v56, v18, v19, vcc ; VI-NEXT: v_bfe_u32 v18, v7, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v7 ; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v7 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; VI-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc +; 
VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v56 ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_alignbit_b32 v7, v7, v17, 16 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v10 +; VI-NEXT: v_or_b32_e32 v22, v7, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v10 ; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 ; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 -; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; VI-NEXT: v_cndmask_b32_e32 v41, v18, v19, vcc ; VI-NEXT: v_bfe_u32 v18, v10, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v10 ; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v10 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 ; VI-NEXT: v_cndmask_b32_e32 v10, v18, v19, vcc +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v41 ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_alignbit_b32 v10, v10, v17, 16 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v9 +; VI-NEXT: v_or_b32_e32 v33, v10, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v9 ; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 ; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 -; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; VI-NEXT: v_cndmask_b32_e32 v43, v18, v19, vcc ; VI-NEXT: v_bfe_u32 v18, v9, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v9 ; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v9 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 ; VI-NEXT: v_cndmask_b32_e32 v9, v18, v19, vcc +; 
VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v43 ; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_alignbit_b32 v9, v9, v17, 16 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v12 +; VI-NEXT: v_or_b32_e32 v32, v9, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v12 ; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 ; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 -; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; VI-NEXT: v_cndmask_b32_e32 v52, v18, v19, vcc ; VI-NEXT: v_bfe_u32 v18, v12, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v12 ; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v12 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 ; VI-NEXT: v_cndmask_b32_e32 v12, v18, v19, vcc +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v52 ; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: v_alignbit_b32 v12, v12, v17, 16 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v11 +; VI-NEXT: v_or_b32_e32 v21, v12, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v11 ; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 ; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 -; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; VI-NEXT: v_cndmask_b32_e32 v54, v18, v19, vcc ; VI-NEXT: v_bfe_u32 v18, v11, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v11 ; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v11 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; VI-NEXT: v_cndmask_b32_e32 v11, v18, 
v19, vcc +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v54 ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_alignbit_b32 v11, v11, v17, 16 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v14 +; VI-NEXT: v_or_b32_e32 v20, v11, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v14 ; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 ; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 -; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; VI-NEXT: v_cndmask_b32_e32 v51, v18, v19, vcc ; VI-NEXT: v_bfe_u32 v18, v14, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v14 ; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v14 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 ; VI-NEXT: v_cndmask_b32_e32 v14, v18, v19, vcc +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v51 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: v_alignbit_b32 v14, v14, v17, 16 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v13 +; VI-NEXT: v_or_b32_e32 v39, v14, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v13 ; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 ; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 -; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; VI-NEXT: v_cndmask_b32_e32 v40, v18, v19, vcc ; VI-NEXT: v_bfe_u32 v18, v13, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v13 ; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; VI-NEXT: 
v_cndmask_b32_e32 v13, v18, v19, vcc +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v40 ; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: v_alignbit_b32 v13, v13, v17, 16 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; VI-NEXT: v_or_b32_e32 v38, v13, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 ; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 ; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 -; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; VI-NEXT: v_cndmask_b32_e32 v45, v18, v19, vcc ; VI-NEXT: v_bfe_u32 v18, v16, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v16 ; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v16 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 ; VI-NEXT: v_cndmask_b32_e32 v16, v18, v19, vcc +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v45 ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: v_alignbit_b32 v16, v16, v17, 16 -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v15 +; VI-NEXT: v_or_b32_e32 v49, v16, v17 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v15 ; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 ; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 -; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; VI-NEXT: v_cndmask_b32_e32 v58, v18, v19, vcc ; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 ; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 ; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 ; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, 
v15 ; VI-NEXT: v_cndmask_b32_e32 v15, v18, v19, vcc +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v58 ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v17, 16 -; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] -; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] -; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] -; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[3:4] -; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[7:8] -; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[1:2] -; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] -; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v14 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v13 -; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v12 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v12 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v11 -; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v10 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v55, 24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v58, 
16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v48, v15, v17 +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[48:49] +; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[38:39] +; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v49 +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v48 +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v39 +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v38 +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v20 +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[20:21] +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v21 +; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[32:33] +; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v23 +; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[22:23] +; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v22 +; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v26 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v25 +; VI-NEXT: v_lshrrev_b64 v[26:27], 24, v[25:26] +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[30:31] +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[28:29] +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v31 +; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v30 +; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v29 +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v28 +; VI-NEXT: v_lshrrev_b32_e32 v28, 24, v45 +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v45 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v58 +; 
VI-NEXT: v_lshrrev_b32_e32 v31, 24, v51 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v51 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v40 +; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v52 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v52 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v54 +; VI-NEXT: v_lshrrev_b32_e32 v40, 24, v41 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v41 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v43 +; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v46 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v46 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v56 +; VI-NEXT: v_lshrrev_b32_e32 v58, 24, v59 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v59 +; VI-NEXT: v_lshrrev_b32_e32 v61, 16, v61 +; VI-NEXT: v_lshrrev_b32_e32 v62, 24, v34 +; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v36 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v37 ; VI-NEXT: .LBB108_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v29 -; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v22 -; VI-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v22, v27, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v60 +; VI-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v57 +; VI-NEXT: v_or_b32_sdwa v2, v2, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v24 +; VI-NEXT: v_or_b32_sdwa v22, v37, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v28 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v62 -; VI-NEXT: v_or_b32_sdwa v2, v2, v23 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v35 +; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v21 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v47 +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v59 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v57 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v44 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v62 ; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v56 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v26 ; 
VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v61, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v46 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v44 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v55 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v58 ; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v43 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v24 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v53 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v23 ; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v41 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v50 +; VI-NEXT: 
v_lshlrev_b16_e32 v2, 8, v45 ; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v49 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v20 ; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v52 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v50 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v48 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v40 ; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v49 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v19 ; VI-NEXT: 
v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v38 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v51 ; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 +; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 -; VI-NEXT: 
v_lshlrev_b16_e32 v2, 8, v32 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v31 +; VI-NEXT: v_or_b32_sdwa v2, v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v17 -; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v26 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v28 +; VI-NEXT: v_or_b32_sdwa v2, v29, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 -; VI-NEXT: s_waitcnt vmcnt(1) 
+; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload @@ -86891,596 +87273,651 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s17 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v2 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s16 ; SI-NEXT: v_mul_f32_e32 v20, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v9 ; SI-NEXT: v_mul_f32_e32 v44, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v17 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 -; SI-NEXT: 
v_mul_f32_e64 v22, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v18 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v41, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21 ; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s23 ; SI-NEXT: v_mul_f32_e64 v9, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s25 ; SI-NEXT: v_mul_f32_e64 v7, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s28 ; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 
offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB109_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 -; SI-NEXT: v_alignbit_b32 v27, v1, v3, 16 -; SI-NEXT: v_alignbit_b32 v30, v24, v2, 16 -; SI-NEXT: v_alignbit_b32 v1, v30, v27, 24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v15, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v28 +; SI-NEXT: v_or_b32_e32 v16, v1, v2 +; SI-NEXT: v_alignbit_b32 v1, v16, v15, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v16, v15, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v30, v27, 16 +; SI-NEXT: v_alignbit_b32 v1, v16, v15, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v30, v27, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v13, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v35 +; SI-NEXT: v_or_b32_e32 v14, v1, v2 +; SI-NEXT: v_alignbit_b32 v1, v14, v13, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 -; SI-NEXT: v_alignbit_b32 v21, v1, v6, 16 -; SI-NEXT: v_alignbit_b32 v19, v17, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v19, v21, 24 +; SI-NEXT: v_alignbit_b32 v1, v14, v13, 16 ; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v19, v21, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v19, v21, 8 +; SI-NEXT: v_alignbit_b32 v1, v14, v13, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26 -; SI-NEXT: v_alignbit_b32 v15, v1, v9, 16 -; SI-NEXT: v_alignbit_b32 v16, v13, v7, 16 -; SI-NEXT: v_alignbit_b32 v1, v16, v15, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v11, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v22 +; SI-NEXT: v_or_b32_e32 v12, v1, v2 +; SI-NEXT: v_alignbit_b32 v1, v12, v11, 24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v16, v15, 16 +; SI-NEXT: v_alignbit_b32 v1, v12, v11, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v16, v15, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v42 -; SI-NEXT: v_alignbit_b32 v10, v1, v11, 16 -; SI-NEXT: v_alignbit_b32 v11, v9, v20, 16 -; SI-NEXT: v_alignbit_b32 v1, v11, v10, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v12, v11, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_alignbit_b32 v1, v11, v10, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 +; SI-NEXT: v_or_b32_e32 v9, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v48 +; SI-NEXT: v_or_b32_e32 v10, v1, v2 +; SI-NEXT: v_alignbit_b32 v1, v10, v9, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v11, v10, 8 +; SI-NEXT: v_alignbit_b32 v1, v10, v9, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 +; SI-NEXT: v_or_b32_e32 v7, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v40 +; SI-NEXT: v_or_b32_e32 v8, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v26 +; SI-NEXT: v_or_b32_e32 v5, v1, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 -; SI-NEXT: v_alignbit_b32 v6, v1, v28, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v44 +; SI-NEXT: v_or_b32_e32 v6, v1, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v31 -; SI-NEXT: v_alignbit_b32 v3, v1, v34, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v56 -; SI-NEXT: v_alignbit_b32 v2, v1, v35, 16 -; SI-NEXT: v_alignbit_b32 v8, v7, v33, 16 -; SI-NEXT: v_alignbit_b32 v4, v8, v2, 24 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v36 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_alignbit_b32 v1, v1, v39, 16 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v43 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v44 -; SI-NEXT: v_alignbit_b32 v5, v4, v32, 16 -; SI-NEXT: v_mov_b32_e32 v31, v23 -; SI-NEXT: v_alignbit_b32 v20, v18, v23, 16 -; SI-NEXT: v_alignbit_b32 v14, v12, v29, 16 -; SI-NEXT: v_alignbit_b32 
v23, v5, v1, 24 -; SI-NEXT: v_mov_b32_e32 v38, v36 -; SI-NEXT: v_alignbit_b32 v36, v20, v6, 24 -; SI-NEXT: v_alignbit_b32 v25, v14, v3, 24 -; SI-NEXT: v_alignbit_b32 v50, v8, v2, 16 -; SI-NEXT: v_mov_b32_e32 v53, v32 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v5, v1, 16 -; SI-NEXT: v_alignbit_b32 v32, v5, v1, 8 -; SI-NEXT: v_alignbit_b32 v55, v20, v6, 16 -; SI-NEXT: v_alignbit_b32 v40, v20, v6, 8 -; SI-NEXT: v_mov_b32_e32 v35, v29 -; SI-NEXT: v_alignbit_b32 v52, v14, v3, 16 -; SI-NEXT: v_alignbit_b32 v54, v14, v3, 8 -; SI-NEXT: v_mov_b32_e32 v37, v33 -; SI-NEXT: v_alignbit_b32 v51, v8, v2, 8 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v22 -; SI-NEXT: v_lshrrev_b32_e32 v62, 8, v30 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v23, v41 -; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v41 -; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v19 -; SI-NEXT: v_mov_b32_e32 v28, v26 -; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v26 -; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v16 -; SI-NEXT: v_mov_b32_e32 v26, v42 -; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v42 -; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v11 -; SI-NEXT: v_mov_b32_e32 v29, v43 -; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v43 -; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v20 -; SI-NEXT: v_mov_b32_e32 v34, v44 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v30 +; SI-NEXT: v_or_b32_e32 v3, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v29 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 +; SI-NEXT: v_or_b32_e32 v4, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v38 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v17 +; SI-NEXT: v_alignbit_b32 v17, v10, v9, 8 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte 
Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v17, v4, v3, 24 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v17, v4, v3, 16 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v17, v2, v1, 24 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v20, v2, v1, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v17, v2, v1, 8 +; SI-NEXT: v_mov_b32_e32 v33, v31 +; SI-NEXT: v_mov_b32_e32 v32, v30 +; SI-NEXT: v_mov_b32_e32 v31, v29 +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_mov_b32_e32 v18, v34 +; SI-NEXT: v_alignbit_b32 v51, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v53, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v55, v8, v7, 8 +; SI-NEXT: v_alignbit_b32 v34, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v26, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v49, v6, v5, 8 +; SI-NEXT: v_alignbit_b32 v27, v4, v3, 8 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v50, 8, v16 +; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v52, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v6 +; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v43, 8, v2 +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v28 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v35 +; SI-NEXT: v_mov_b32_e32 v21, v35 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v22 +; SI-NEXT: v_mov_b32_e32 v23, v22 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v48 +; SI-NEXT: v_mov_b32_e32 v22, v48 +; SI-NEXT: 
v_lshrrev_b32_e32 v42, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v57, 24, v40 +; SI-NEXT: v_mov_b32_e32 v24, v40 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v40 ; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v44 -; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v14 -; SI-NEXT: v_mov_b32_e32 v33, v56 -; SI-NEXT: v_lshrrev_b32_e32 v43, 24, v56 -; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v8 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v48 -; SI-NEXT: v_mov_b32_e32 v48, v32 -; SI-NEXT: v_mov_b32_e32 v32, v50 -; SI-NEXT: v_mov_b32_e32 v50, v25 -; SI-NEXT: v_mov_b32_e32 v25, v36 -; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v5 +; SI-NEXT: v_mov_b32_e32 v25, v44 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v60, 24, v54 +; SI-NEXT: v_mov_b32_e32 v30, v54 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v44, 24, v36 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v36 +; SI-NEXT: v_mov_b32_e32 v36, v20 ; SI-NEXT: s_cbranch_execnz .LBB109_3 ; SI-NEXT: .LBB109_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v36 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v49 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v42, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v42 -; SI-NEXT: v_alignbit_b32 v5, v4, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v43 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, 
v34 -; SI-NEXT: v_add_f32_e32 v44, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v44 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v45, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v45 -; SI-NEXT: v_alignbit_b32 v48, v5, v1, 8 -; SI-NEXT: v_lshrrev_b32_e32 v43, 24, v43 -; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v42 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v18 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v8, v7, v3, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte 
Folded Reload -; SI-NEXT: v_alignbit_b32 v32, v8, v2, 16 -; SI-NEXT: v_alignbit_b32 v51, v8, v2, 8 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v31 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v52 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v54, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v44, 0x40c00000, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v52 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v18 +; SI-NEXT: v_alignbit_b32 v27, v4, v3, 8 +; SI-NEXT: v_alignbit_b32 v36, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v17 +; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v44 +; SI-NEXT: 
v_lshrrev_b32_e32 v41, 24, v41 +; SI-NEXT: v_lshrrev_b32_e32 v57, 24, v43 +; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v54 +; SI-NEXT: v_lshrrev_b32_e32 v60, 24, v52 +; SI-NEXT: v_lshrrev_b32_e32 v44, 24, v18 +; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v43, 8, v2 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 
+; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_alignbit_b32 v15, v15, v13, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v21, v19, v17, 16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v3, v6, v3, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v14, v12, v6, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v50, v14, v3, 24 -; SI-NEXT: v_alignbit_b32 v52, v14, v3, 16 -; SI-NEXT: v_alignbit_b32 v54, v14, v3, 8 -; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v14 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v13 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v17 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], 
s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v28 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v17 -; SI-NEXT: v_add_f32_e32 v56, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v56 -; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v41 -; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v56 -; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v8 -; SI-NEXT: v_alignbit_b32 v19, v17, v19, 16 -; SI-NEXT: v_alignbit_b32 v16, v13, v16, 16 -; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v19 -; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v16 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v19 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v6, v9, v6, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v20, v18, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v25, v20, v6, 24 -; SI-NEXT: v_alignbit_b32 v55, v20, v6, 16 -; SI-NEXT: v_alignbit_b32 v40, v20, v6, 8 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_alignbit_b32 v27, v23, v22, 16 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v34, v6, v5, 24 +; SI-NEXT: v_alignbit_b32 v26, v6, v5, 16 
+; SI-NEXT: v_alignbit_b32 v49, v6, v5, 8 +; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v51, v8, v7, 24 +; SI-NEXT: v_alignbit_b32 v53, v8, v7, 16 +; SI-NEXT: v_alignbit_b32 v55, v8, v7, 8 +; SI-NEXT: v_lshrrev_b32_e32 v52, 8, v8 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v10, v10, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v22, v10, v9, 24 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v10 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v59, 0x40c00000, v23 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v59 -; SI-NEXT: 
v_alignbit_b32 v30, v24, v22, 16 -; SI-NEXT: v_alignbit_b32 v22, v30, v27, 24 -; SI-NEXT: v_lshrrev_b32_e32 v62, 8, v30 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v30, v27, 16 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 
0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v16, v20 +; SI-NEXT: v_alignbit_b32 v20, v16, v15, 24 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v30, v27, 8 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v20, v16, v15, 16 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v19, v21, 24 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v20, v16, v15, 8 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v19, v21, 16 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v20, v14, v13, 24 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v19, v21, 8 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v20, v14, v13, 16 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v16, v15, 24 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v20, v14, v13, 8 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v16, v15, 16 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v20, v12, v11, 24 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt 
expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v16, v15, 8 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v47, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v47 -; SI-NEXT: v_alignbit_b32 v11, v9, v11, 16 +; SI-NEXT: v_alignbit_b32 v20, v12, v11, 16 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v11, v10, 24 -; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v11 -; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v47 -; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v20 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v20, v12, v11, 8 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v11, v10, 16 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v20, v10, v9, 16 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v11, v10, 8 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v20, v10, v9, 8 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v8, v2, 24 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v20, v4, v3, 24 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v5, v1, 24 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 
4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v20, v4, v3, 16 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v5, v1, 16 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v20, v2, v1, 24 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v59 -; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v45 -; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v44 -; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v5 +; SI-NEXT: v_alignbit_b32 v20, v2, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v50, 8, v16 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: .LBB109_3: ; %end -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 -; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v22 -; SI-NEXT: v_or_b32_e32 v22, v22, v24 -; SI-NEXT: v_add_i32_e32 v24, vcc, 4, v0 -; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 -; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: 
v_lshlrev_b32_e32 v36, 8, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v27, v27, v36 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v35, 0xff, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v23 -; SI-NEXT: v_or_b32_e32 v33, v33, v35 -; SI-NEXT: v_or_b32_e32 v27, v27, v33 -; SI-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v27, 0xff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v62 -; SI-NEXT: v_or_b32_e32 v27, v27, v30 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v22, v27, v22 -; SI-NEXT: buffer_store_dword v22, v24, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v46 -; SI-NEXT: v_or_b32_e32 v19, v19, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v41 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v17, v21, v17 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: 
v_add_i32_e32 v19, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; SI-NEXT: v_or_b32_e32 v15, v15, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_or_b32_e32 v15, v15, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v23, v18 +; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v15, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v57 +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v50 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v61 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v19 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v13, v16, v13 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_add_i32_e32 v15, vcc, 20, v0 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: v_or_b32_e32 v15, 
v15, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v47 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; SI-NEXT: v_or_b32_e32 v10, v10, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: 
v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v15 +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_or_b32_e32 v10, v10, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v58 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v63 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v56 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v61 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 
0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v29 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v41 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: v_add_i32_e32 v10, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v40 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v55 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v53 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v25 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v51 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 ; SI-NEXT: v_add_i32_e32 v9, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v47 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v59 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: 
v_add_i32_e32 v9, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v7, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v52 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v57 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v54 -; SI-NEXT: v_or_b32_e32 v3, v3, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v52 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v50 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: v_or_b32_e32 v3, v3, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v49 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v34 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v60 -; SI-NEXT: v_or_b32_e32 v3, v3, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v46 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v48 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v45 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: v_or_b32_e32 v3, v3, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v45 +; SI-NEXT: v_and_b32_e32 v5, 
0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v51 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v27 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v56 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v43 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v59 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v35 +; 
SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v60 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v44 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v43 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v28 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v42 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v44 ; SI-NEXT: v_and_b32_e32 v1, 
0xffff, v1 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -87507,813 +87944,866 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: .LBB109_4: ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: v_mov_b32_e32 v53, v32 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: v_mov_b32_e32 v39, v38 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v49, v48 +; SI-NEXT: v_mov_b32_e32 v38, v37 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v37, v33 +; SI-NEXT: v_mov_b32_e32 v37, v36 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v33, v56 +; SI-NEXT: v_mov_b32_e32 v18, v34 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v35, v29 +; SI-NEXT: v_mov_b32_e32 v33, v31 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v34, v44 +; SI-NEXT: v_mov_b32_e32 v32, v30 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v31, v23 +; SI-NEXT: v_mov_b32_e32 v31, v29 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v29, v43 +; SI-NEXT: v_mov_b32_e32 v30, v54 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v28, v26 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v25, v44 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v26, v42 +; SI-NEXT: v_mov_b32_e32 v24, v40 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v23, v41 +; SI-NEXT: v_mov_b32_e32 v23, v22 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: v_mov_b32_e32 v22, v48 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: 
; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: v_mov_b32_e32 v21, v35 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; 
implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: s_branch .LBB109_2 ; ; VI-LABEL: bitcast_v32bf16_to_v64i8_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v63, s30, 0 -; VI-NEXT: v_writelane_b32 v63, s31, 1 -; VI-NEXT: v_writelane_b32 v63, s34, 2 -; VI-NEXT: v_writelane_b32 v63, s35, 3 -; VI-NEXT: 
v_writelane_b32 v63, s36, 4 -; VI-NEXT: v_writelane_b32 v63, s37, 5 -; VI-NEXT: v_writelane_b32 v63, s38, 6 -; VI-NEXT: v_writelane_b32 v63, s39, 7 -; VI-NEXT: v_writelane_b32 v63, s48, 8 -; VI-NEXT: v_writelane_b32 v63, s49, 9 -; VI-NEXT: v_writelane_b32 v63, s50, 10 -; VI-NEXT: v_writelane_b32 v63, s51, 11 -; VI-NEXT: v_writelane_b32 v63, s52, 12 -; VI-NEXT: v_writelane_b32 v63, s53, 13 -; VI-NEXT: v_writelane_b32 v63, s54, 14 -; VI-NEXT: v_writelane_b32 v63, s55, 15 -; VI-NEXT: v_writelane_b32 v63, s64, 16 -; VI-NEXT: v_writelane_b32 v63, s65, 17 -; VI-NEXT: v_writelane_b32 v63, s66, 18 +; VI-NEXT: v_writelane_b32 v4, s30, 0 +; VI-NEXT: v_writelane_b32 v4, s31, 1 +; VI-NEXT: v_writelane_b32 v4, s34, 2 +; VI-NEXT: v_writelane_b32 v4, s35, 3 +; VI-NEXT: v_writelane_b32 v4, s36, 4 +; VI-NEXT: v_writelane_b32 v4, s37, 5 +; VI-NEXT: v_writelane_b32 v4, s38, 6 +; VI-NEXT: v_writelane_b32 v4, s39, 7 +; VI-NEXT: v_writelane_b32 v4, s48, 8 +; VI-NEXT: v_writelane_b32 v4, s49, 9 +; VI-NEXT: v_writelane_b32 v4, s50, 10 +; VI-NEXT: v_writelane_b32 v4, s51, 11 +; VI-NEXT: v_writelane_b32 v4, s52, 12 +; VI-NEXT: v_writelane_b32 v4, s53, 13 +; VI-NEXT: v_writelane_b32 v4, s54, 14 +; VI-NEXT: v_writelane_b32 v4, s55, 15 +; VI-NEXT: v_writelane_b32 v4, s64, 16 +; VI-NEXT: v_writelane_b32 v4, s65, 17 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; VI-NEXT: v_writelane_b32 v63, s67, 19 +; VI-NEXT: v_writelane_b32 v4, s66, 18 ; VI-NEXT: v_readfirstlane_b32 s4, v1 ; VI-NEXT: s_and_b64 s[6:7], vcc, exec ; VI-NEXT: v_readfirstlane_b32 s5, v2 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword 
v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: s_cbranch_scc0 .LBB109_3 +; VI-NEXT: v_writelane_b32 v4, s67, 19 +; VI-NEXT: s_cbranch_scc0 .LBB109_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_lshr_b32 s56, s5, 24 ; VI-NEXT: s_lshr_b32 s57, s5, 16 -; VI-NEXT: s_lshr_b32 s59, s5, 8 +; VI-NEXT: s_lshr_b32 s76, s5, 8 ; VI-NEXT: s_lshr_b32 s58, s4, 16 -; VI-NEXT: s_lshr_b32 s60, s4, 8 -; VI-NEXT: s_lshr_b32 s61, s29, 24 -; VI-NEXT: s_lshr_b32 s62, s29, 16 -; VI-NEXT: s_lshr_b32 s72, s29, 8 -; VI-NEXT: s_lshr_b32 s63, s28, 16 -; VI-NEXT: s_lshr_b32 s73, s28, 8 -; VI-NEXT: s_lshr_b32 s74, s27, 24 -; VI-NEXT: s_lshr_b32 s75, s27, 16 -; VI-NEXT: s_lshr_b32 s77, s27, 8 -; VI-NEXT: s_lshr_b32 s76, s26, 16 -; VI-NEXT: s_lshr_b32 s78, s26, 8 -; VI-NEXT: s_lshr_b32 s79, s25, 24 -; VI-NEXT: s_lshr_b32 s88, s25, 16 -; VI-NEXT: s_lshr_b32 s90, s25, 8 -; VI-NEXT: s_lshr_b32 s89, s24, 16 -; VI-NEXT: s_lshr_b32 s91, s24, 8 -; VI-NEXT: s_lshr_b32 s30, s23, 24 -; VI-NEXT: s_lshr_b32 s31, s23, 16 -; VI-NEXT: s_lshr_b32 s35, s23, 8 -; VI-NEXT: s_lshr_b32 s34, s22, 16 -; VI-NEXT: s_lshr_b32 s36, s22, 8 -; VI-NEXT: s_lshr_b32 s37, s21, 24 -; VI-NEXT: s_lshr_b32 s38, s21, 16 -; VI-NEXT: s_lshr_b32 s48, s21, 8 -; VI-NEXT: s_lshr_b32 
s39, s20, 16 -; VI-NEXT: s_lshr_b32 s49, s20, 8 -; VI-NEXT: s_lshr_b32 s50, s19, 24 -; VI-NEXT: s_lshr_b32 s51, s19, 16 -; VI-NEXT: s_lshr_b32 s53, s19, 8 -; VI-NEXT: s_lshr_b32 s52, s18, 16 -; VI-NEXT: s_lshr_b32 s54, s18, 8 -; VI-NEXT: s_lshr_b32 s55, s17, 24 -; VI-NEXT: s_lshr_b32 s64, s17, 16 -; VI-NEXT: s_lshr_b32 s66, s17, 8 -; VI-NEXT: s_lshr_b32 s65, s16, 16 -; VI-NEXT: s_lshr_b32 s67, s16, 8 -; VI-NEXT: s_lshr_b64 s[44:45], s[4:5], 24 -; VI-NEXT: s_lshr_b64 s[42:43], s[28:29], 24 -; VI-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 -; VI-NEXT: s_lshr_b64 s[14:15], s[24:25], 24 -; VI-NEXT: s_lshr_b64 s[12:13], s[22:23], 24 -; VI-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 -; VI-NEXT: s_lshr_b64 s[8:9], s[18:19], 24 -; VI-NEXT: s_lshr_b64 s[6:7], s[16:17], 24 -; VI-NEXT: s_cbranch_execnz .LBB109_4 +; VI-NEXT: s_lshr_b32 s77, s4, 8 +; VI-NEXT: s_lshr_b32 s59, s29, 24 +; VI-NEXT: s_lshr_b32 s60, s29, 16 +; VI-NEXT: s_lshr_b32 s75, s29, 8 +; VI-NEXT: s_lshr_b32 s61, s28, 16 +; VI-NEXT: s_lshr_b32 s74, s28, 8 +; VI-NEXT: s_lshr_b32 s62, s27, 24 +; VI-NEXT: s_lshr_b32 s78, s27, 16 +; VI-NEXT: s_lshr_b32 s73, s27, 8 +; VI-NEXT: s_lshr_b32 s88, s26, 16 +; VI-NEXT: s_lshr_b32 s72, s26, 8 +; VI-NEXT: s_lshr_b32 s90, s25, 24 +; VI-NEXT: s_lshr_b32 s91, s25, 16 +; VI-NEXT: s_lshr_b32 s63, s25, 8 +; VI-NEXT: s_lshr_b32 s31, s24, 16 +; VI-NEXT: s_lshr_b32 s79, s24, 8 +; VI-NEXT: s_lshr_b32 s35, s23, 24 +; VI-NEXT: s_lshr_b32 s37, s23, 16 +; VI-NEXT: s_lshr_b32 s89, s23, 8 +; VI-NEXT: s_lshr_b32 s39, s22, 16 +; VI-NEXT: s_lshr_b32 s30, s22, 8 +; VI-NEXT: s_lshr_b32 s49, s21, 24 +; VI-NEXT: s_lshr_b32 s50, s21, 16 +; VI-NEXT: s_lshr_b32 s34, s21, 8 +; VI-NEXT: s_lshr_b32 s52, s20, 16 +; VI-NEXT: s_lshr_b32 s36, s20, 8 +; VI-NEXT: s_lshr_b32 s54, s19, 24 +; VI-NEXT: s_lshr_b32 s55, s19, 16 +; VI-NEXT: s_lshr_b32 s38, s19, 8 +; VI-NEXT: s_lshr_b32 s64, s18, 16 +; VI-NEXT: s_lshr_b32 s48, s18, 8 +; VI-NEXT: s_lshr_b32 s66, s17, 24 +; VI-NEXT: s_lshr_b32 s65, s17, 16 +; VI-NEXT: 
s_lshr_b32 s51, s17, 8 +; VI-NEXT: s_lshr_b32 s67, s16, 16 +; VI-NEXT: s_lshr_b32 s53, s16, 8 +; VI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[28:29], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[26:27], 24 +; VI-NEXT: s_lshr_b64 s[12:13], s[24:25], 24 +; VI-NEXT: s_lshr_b64 s[14:15], s[22:23], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[20:21], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 +; VI-NEXT: s_lshr_b64 s[44:45], s[16:17], 24 +; VI-NEXT: s_cbranch_execnz .LBB109_3 ; VI-NEXT: .LBB109_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s6, s17, 16 -; VI-NEXT: v_mov_b32_e32 v15, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s6, v15 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: s_and_b32 s6, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s6, v15 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_add_i32 s8, s7, 0x7fff +; VI-NEXT: s_or_b32 s9, s6, 0x400000 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s6, s16, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s6, v15 -; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s65, s9, s8 +; VI-NEXT: s_lshl_b32 s6, s17, 16 +; VI-NEXT: v_add_f32_e32 v2, s6, 
v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_and_b32 s8, s65, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s7, 0x7fff +; VI-NEXT: s_or_b32 s10, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s6, s10, s9 +; VI-NEXT: s_lshr_b32 s17, s6, 16 ; VI-NEXT: s_and_b32 s6, s16, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; VI-NEXT: v_add_f32_e32 v3, s6, v15 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: s_lshl_b32 s6, s19, 16 -; VI-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; VI-NEXT: v_add_f32_e32 v3, s6, v15 -; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_or_b32 s47, s17, s8 +; VI-NEXT: s_add_i32 s8, s7, 0x7fff +; VI-NEXT: s_or_b32 s9, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s67, s9, s8 +; VI-NEXT: s_lshl_b32 s6, s16, 16 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_and_b32 s8, s67, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s7, 0x7fff +; VI-NEXT: s_or_b32 s10, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s6, s10, s9 +; VI-NEXT: s_lshr_b32 s16, s6, 16 ; VI-NEXT: s_and_b32 s6, s19, 0xffff0000 -; 
VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_add_f32_e32 v4, s6, v15 -; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; VI-NEXT: s_lshl_b32 s6, s18, 16 -; VI-NEXT: v_alignbit_b32 v4, v4, v3, 16 -; VI-NEXT: v_add_f32_e32 v3, s6, v15 -; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_or_b32 s46, s16, s8 +; VI-NEXT: s_add_i32 s8, s7, 0x7fff +; VI-NEXT: s_or_b32 s9, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s55, s9, s8 +; VI-NEXT: s_lshl_b32 s6, s19, 16 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_and_b32 s8, s55, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s7, 0x7fff +; VI-NEXT: s_or_b32 s10, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s6, s10, s9 +; VI-NEXT: s_lshr_b32 s19, s6, 16 ; VI-NEXT: s_and_b32 s6, s18, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; VI-NEXT: v_add_f32_e32 v5, s6, v15 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: s_lshl_b32 s6, s21, 16 -; VI-NEXT: 
v_alignbit_b32 v3, v5, v3, 16 -; VI-NEXT: v_add_f32_e32 v5, s6, v15 -; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 -; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 -; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_or_b32 s57, s19, s8 +; VI-NEXT: s_add_i32 s8, s7, 0x7fff +; VI-NEXT: s_or_b32 s9, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s64, s9, s8 +; VI-NEXT: s_lshl_b32 s6, s18, 16 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_and_b32 s8, s64, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s7, 0x7fff +; VI-NEXT: s_or_b32 s10, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s6, s10, s9 +; VI-NEXT: s_lshr_b32 s18, s6, 16 ; VI-NEXT: s_and_b32 s6, s21, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; VI-NEXT: v_add_f32_e32 v6, s6, v15 -; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v8, 0x400000, v6 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; VI-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; VI-NEXT: s_lshl_b32 s6, s20, 16 -; VI-NEXT: v_alignbit_b32 v6, v6, v5, 16 -; VI-NEXT: v_add_f32_e32 v5, s6, v15 -; VI-NEXT: v_bfe_u32 v7, v5, 16, 1 -; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v5 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 -; VI-NEXT: v_or_b32_e32 v8, 0x400000, v5 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: 
s_or_b32 s56, s18, s8 +; VI-NEXT: s_add_i32 s8, s7, 0x7fff +; VI-NEXT: s_or_b32 s9, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s50, s9, s8 +; VI-NEXT: s_lshl_b32 s6, s21, 16 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_and_b32 s8, s50, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s7, 0x7fff +; VI-NEXT: s_or_b32 s10, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s6, s10, s9 +; VI-NEXT: s_lshr_b32 s21, s6, 16 ; VI-NEXT: s_and_b32 s6, s20, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc -; VI-NEXT: v_add_f32_e32 v7, s6, v15 -; VI-NEXT: v_bfe_u32 v8, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; VI-NEXT: s_lshl_b32 s6, s23, 16 -; VI-NEXT: v_alignbit_b32 v5, v7, v5, 16 -; VI-NEXT: v_add_f32_e32 v7, s6, v15 -; VI-NEXT: v_bfe_u32 v8, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 -; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_or_b32 s59, s21, s8 +; VI-NEXT: s_add_i32 s8, s7, 0x7fff +; VI-NEXT: s_or_b32 s9, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s52, s9, s8 +; VI-NEXT: s_lshl_b32 s6, s20, 16 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_and_b32 s8, s52, 
0xffff0000 +; VI-NEXT: s_add_i32 s9, s7, 0x7fff +; VI-NEXT: s_or_b32 s10, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s6, s10, s9 +; VI-NEXT: s_lshr_b32 s20, s6, 16 ; VI-NEXT: s_and_b32 s6, s23, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc -; VI-NEXT: v_add_f32_e32 v8, s6, v15 -; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; VI-NEXT: s_lshl_b32 s6, s22, 16 -; VI-NEXT: v_alignbit_b32 v8, v8, v7, 16 -; VI-NEXT: v_add_f32_e32 v7, s6, v15 -; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 -; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 -; VI-NEXT: v_or_b32_e32 v10, 0x400000, v7 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_or_b32 s58, s20, s8 +; VI-NEXT: s_add_i32 s8, s7, 0x7fff +; VI-NEXT: s_or_b32 s9, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s37, s9, s8 +; VI-NEXT: s_lshl_b32 s6, s23, 16 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_and_b32 s8, s37, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s7, 0x7fff +; VI-NEXT: s_or_b32 s10, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s6, s10, s9 +; VI-NEXT: s_lshr_b32 s23, s6, 16 ; VI-NEXT: s_and_b32 s6, s22, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc -; VI-NEXT: v_add_f32_e32 v9, s6, v15 -; VI-NEXT: v_bfe_u32 v10, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 
v10, vcc, v10, v9 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; VI-NEXT: s_lshl_b32 s6, s25, 16 -; VI-NEXT: v_alignbit_b32 v7, v9, v7, 16 -; VI-NEXT: v_add_f32_e32 v9, s6, v15 -; VI-NEXT: v_bfe_u32 v10, v9, 16, 1 -; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 -; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_or_b32 s61, s23, s8 +; VI-NEXT: s_add_i32 s8, s7, 0x7fff +; VI-NEXT: s_or_b32 s9, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s39, s9, s8 +; VI-NEXT: s_lshl_b32 s6, s22, 16 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_and_b32 s8, s39, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s7, 0x7fff +; VI-NEXT: s_or_b32 s10, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s6, s10, s9 +; VI-NEXT: s_lshr_b32 s22, s6, 16 ; VI-NEXT: s_and_b32 s6, s25, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc -; VI-NEXT: v_add_f32_e32 v10, s6, v15 -; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 -; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: v_or_b32_e32 v12, 0x400000, v10 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; VI-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; VI-NEXT: s_lshl_b32 s6, s24, 16 -; VI-NEXT: v_alignbit_b32 v10, v10, v9, 16 -; VI-NEXT: v_add_f32_e32 v9, s6, v15 -; VI-NEXT: v_bfe_u32 v11, v9, 16, 1 -; VI-NEXT: 
v_add_u32_e32 v11, vcc, v11, v9 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x7fff, v11 -; VI-NEXT: v_or_b32_e32 v12, 0x400000, v9 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_or_b32 s60, s22, s8 +; VI-NEXT: s_add_i32 s8, s7, 0x7fff +; VI-NEXT: s_or_b32 s9, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s91, s9, s8 +; VI-NEXT: s_lshl_b32 s6, s25, 16 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_and_b32 s8, s91, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s7, 0x7fff +; VI-NEXT: s_or_b32 s10, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s6, s10, s9 +; VI-NEXT: s_lshr_b32 s25, s6, 16 ; VI-NEXT: s_and_b32 s6, s24, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc -; VI-NEXT: v_add_f32_e32 v11, s6, v15 -; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; VI-NEXT: s_lshl_b32 s6, s27, 16 -; VI-NEXT: v_alignbit_b32 v9, v11, v9, 16 -; VI-NEXT: v_add_f32_e32 v11, s6, v15 -; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x7fff, v12 -; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_or_b32 s63, s25, s8 +; VI-NEXT: s_add_i32 s8, s7, 0x7fff +; VI-NEXT: s_or_b32 
s9, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s31, s9, s8 +; VI-NEXT: s_lshl_b32 s6, s24, 16 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_and_b32 s8, s31, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s7, 0x7fff +; VI-NEXT: s_or_b32 s10, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s6, s10, s9 +; VI-NEXT: s_lshr_b32 s24, s6, 16 ; VI-NEXT: s_and_b32 s6, s27, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc -; VI-NEXT: v_add_f32_e32 v12, s6, v15 -; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc -; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; VI-NEXT: s_lshl_b32 s6, s26, 16 -; VI-NEXT: v_alignbit_b32 v12, v12, v11, 16 -; VI-NEXT: v_add_f32_e32 v11, s6, v15 -; VI-NEXT: v_bfe_u32 v13, v11, 16, 1 -; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v11 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x7fff, v13 -; VI-NEXT: v_or_b32_e32 v14, 0x400000, v11 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_or_b32 s62, s24, s8 +; VI-NEXT: s_add_i32 s8, s7, 0x7fff +; VI-NEXT: s_or_b32 s9, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s78, s9, s8 +; VI-NEXT: s_lshl_b32 s6, s27, 16 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_and_b32 s8, s78, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s7, 0x7fff +; 
VI-NEXT: s_or_b32 s10, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s6, s10, s9 +; VI-NEXT: s_lshr_b32 s27, s6, 16 ; VI-NEXT: s_and_b32 s6, s26, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v11, v13, v14, vcc -; VI-NEXT: v_add_f32_e32 v13, s6, v15 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v16, vcc -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; VI-NEXT: s_lshl_b32 s6, s29, 16 -; VI-NEXT: v_alignbit_b32 v11, v13, v11, 16 -; VI-NEXT: v_add_f32_e32 v13, s6, v15 -; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x7fff, v14 -; VI-NEXT: v_or_b32_e32 v16, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_or_b32 s73, s27, s8 +; VI-NEXT: s_add_i32 s8, s7, 0x7fff +; VI-NEXT: s_or_b32 s9, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s88, s9, s8 +; VI-NEXT: s_lshl_b32 s6, s26, 16 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_and_b32 s8, s88, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s7, 0x7fff +; VI-NEXT: s_or_b32 s10, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s6, s10, s9 +; VI-NEXT: s_lshr_b32 s26, s6, 16 ; VI-NEXT: s_and_b32 s6, s29, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v13, v14, v16, vcc -; VI-NEXT: v_add_f32_e32 v14, s6, v15 -; VI-NEXT: v_bfe_u32 v16, v14, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, 
v14 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v14 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; VI-NEXT: v_cndmask_b32_e32 v14, v16, v17, vcc -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; VI-NEXT: s_lshl_b32 s6, s28, 16 -; VI-NEXT: v_alignbit_b32 v14, v14, v13, 16 -; VI-NEXT: v_add_f32_e32 v13, s6, v15 -; VI-NEXT: v_bfe_u32 v16, v13, 16, 1 -; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v13 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 -; VI-NEXT: v_or_b32_e32 v17, 0x400000, v13 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_or_b32 s72, s26, s8 +; VI-NEXT: s_add_i32 s8, s7, 0x7fff +; VI-NEXT: s_or_b32 s9, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s90, s9, s8 +; VI-NEXT: s_lshl_b32 s6, s29, 16 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_and_b32 s8, s90, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s7, 0x7fff +; VI-NEXT: s_or_b32 s10, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s6, s10, s9 +; VI-NEXT: s_lshr_b32 s29, s6, 16 ; VI-NEXT: s_and_b32 s6, s28, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v13, v16, v17, vcc -; VI-NEXT: v_add_f32_e32 v16, s6, v15 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; VI-NEXT: s_lshl_b32 s6, s5, 16 -; VI-NEXT: v_alignbit_b32 v13, v16, v13, 16 -; VI-NEXT: v_add_f32_e32 v16, s6, v15 -; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 -; VI-NEXT: 
v_add_u32_e32 v17, vcc, v17, v16 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 -; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; VI-NEXT: s_and_b32 s5, s5, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; VI-NEXT: v_add_f32_e32 v17, s5, v15 -; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; VI-NEXT: s_lshl_b32 s5, s4, 16 -; VI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; VI-NEXT: v_add_f32_e32 v17, s5, v15 -; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: s_and_b32 s4, s4, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; VI-NEXT: v_add_f32_e32 v15, s4, v15 -; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc -; VI-NEXT: v_bfe_u32 v18, v15, 16, 1 -; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18 -; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; VI-NEXT: v_cndmask_b32_e32 v15, v18, v19, vcc -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; VI-NEXT: v_alignbit_b32 v15, v15, v17, 16 -; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] -; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] -; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[13:14] -; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 
4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 -; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14 -; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v14 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v13 -; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v12 -; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v12 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v11 -; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v10 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v10 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v9 -; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v9 -; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 -; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v6 -; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v5 -; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v4 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v2 -; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v1 -; VI-NEXT: s_branch .LBB109_5 -; VI-NEXT: .LBB109_3: +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 
0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_or_b32 s75, s29, s8 +; VI-NEXT: s_add_i32 s8, s7, 0x7fff +; VI-NEXT: s_or_b32 s9, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s49, s9, s8 +; VI-NEXT: s_lshl_b32 s6, s28, 16 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_and_b32 s8, s49, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s7, 0x7fff +; VI-NEXT: s_or_b32 s10, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s6, s10, s9 +; VI-NEXT: s_lshr_b32 s28, s6, 16 +; VI-NEXT: s_and_b32 s6, s5, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_or_b32 s74, s28, s8 +; VI-NEXT: s_add_i32 s8, s7, 0x7fff +; VI-NEXT: s_or_b32 s9, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s54, s9, s8 +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_add_f32_e32 v2, s5, v1 +; VI-NEXT: v_readfirstlane_b32 s5, v2 +; VI-NEXT: s_bfe_u32 s6, s5, 0x10010 +; VI-NEXT: s_add_i32 s6, s6, s5 +; VI-NEXT: s_and_b32 s8, s54, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s6, 0x7fff +; VI-NEXT: s_bitset1_b32 s5, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s5, s5, s9 +; VI-NEXT: s_and_b32 s6, s4, 0xffff0000 +; VI-NEXT: v_add_f32_e32 v2, s6, v1 +; VI-NEXT: v_readfirstlane_b32 s6, v2 +; VI-NEXT: s_bfe_u32 s7, s6, 0x10010 +; VI-NEXT: s_lshr_b32 s5, s5, 16 +; VI-NEXT: s_add_i32 s7, s7, s6 +; VI-NEXT: s_or_b32 s35, s5, s8 +; VI-NEXT: s_add_i32 s8, s7, 0x7fff +; VI-NEXT: s_or_b32 s9, s6, 0x400000 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s66, s9, s8 +; 
VI-NEXT: s_lshl_b32 s4, s4, 16 +; VI-NEXT: v_add_f32_e32 v1, s4, v1 +; VI-NEXT: v_readfirstlane_b32 s4, v1 +; VI-NEXT: s_bfe_u32 s6, s4, 0x10010 +; VI-NEXT: s_add_i32 s6, s6, s4 +; VI-NEXT: s_and_b32 s8, s66, 0xffff0000 +; VI-NEXT: s_add_i32 s9, s6, 0x7fff +; VI-NEXT: s_bitset1_b32 s4, 22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_and_b64 s[6:7], vcc, exec +; VI-NEXT: s_cselect_b32 s4, s4, s9 +; VI-NEXT: s_lshr_b32 s4, s4, 16 +; VI-NEXT: s_or_b32 s34, s4, s8 +; VI-NEXT: s_lshr_b64 s[6:7], s[34:35], 24 +; VI-NEXT: s_lshr_b64 s[8:9], s[74:75], 24 +; VI-NEXT: s_lshr_b64 s[10:11], s[72:73], 24 +; VI-NEXT: s_lshr_b64 s[12:13], s[62:63], 24 +; VI-NEXT: s_lshr_b64 s[14:15], s[60:61], 24 +; VI-NEXT: s_lshr_b64 s[40:41], s[58:59], 24 +; VI-NEXT: s_lshr_b64 s[42:43], s[56:57], 24 +; VI-NEXT: s_lshr_b64 s[44:45], s[46:47], 24 +; VI-NEXT: s_lshr_b32 s76, s35, 8 +; VI-NEXT: s_lshr_b32 s77, s34, 8 +; VI-NEXT: s_lshr_b32 s75, s75, 8 +; VI-NEXT: s_lshr_b32 s74, s74, 8 +; VI-NEXT: s_lshr_b32 s73, s73, 8 +; VI-NEXT: s_lshr_b32 s72, s72, 8 +; VI-NEXT: s_lshr_b32 s63, s63, 8 +; VI-NEXT: s_lshr_b32 s79, s62, 8 +; VI-NEXT: s_lshr_b32 s89, s61, 8 +; VI-NEXT: s_lshr_b32 s30, s60, 8 +; VI-NEXT: s_lshr_b32 s34, s59, 8 +; VI-NEXT: s_lshr_b32 s36, s58, 8 +; VI-NEXT: s_lshr_b32 s38, s57, 8 +; VI-NEXT: s_lshr_b32 s48, s56, 8 +; VI-NEXT: s_lshr_b32 s51, s47, 8 +; VI-NEXT: s_lshr_b32 s53, s46, 8 +; VI-NEXT: s_lshr_b32 s56, s54, 24 +; VI-NEXT: s_lshr_b32 s57, s54, 16 +; VI-NEXT: s_lshr_b32 s58, s66, 16 +; VI-NEXT: s_lshr_b32 s59, s90, 24 +; VI-NEXT: s_lshr_b32 s60, s90, 16 +; VI-NEXT: s_lshr_b32 s61, s49, 16 +; VI-NEXT: s_lshr_b32 s62, s78, 24 +; VI-NEXT: s_lshr_b32 s78, s78, 16 +; VI-NEXT: s_lshr_b32 s88, s88, 16 +; VI-NEXT: s_lshr_b32 s90, s91, 24 +; VI-NEXT: s_lshr_b32 s91, s91, 16 +; VI-NEXT: s_lshr_b32 s31, s31, 16 +; VI-NEXT: s_lshr_b32 s35, s37, 24 +; VI-NEXT: s_lshr_b32 s37, s37, 16 +; VI-NEXT: s_lshr_b32 s39, s39, 16 +; VI-NEXT: s_lshr_b32 s49, s50, 24 +; VI-NEXT: s_lshr_b32 
s50, s50, 16 +; VI-NEXT: s_lshr_b32 s52, s52, 16 +; VI-NEXT: s_lshr_b32 s54, s55, 24 +; VI-NEXT: s_lshr_b32 s55, s55, 16 +; VI-NEXT: s_lshr_b32 s64, s64, 16 +; VI-NEXT: s_lshr_b32 s66, s65, 24 +; VI-NEXT: s_lshr_b32 s65, s65, 16 +; VI-NEXT: s_lshr_b32 s67, s67, 16 +; VI-NEXT: .LBB109_3: ; %end +; VI-NEXT: s_and_b32 s7, s16, 0xff +; VI-NEXT: s_lshl_b32 s9, s53, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s67, 0xff +; VI-NEXT: s_lshl_b32 s11, s44, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: s_and_b32 s7, s17, 0xff +; VI-NEXT: s_lshl_b32 s9, s51, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s65, 0xff +; VI-NEXT: s_lshl_b32 s11, s66, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s18, 0xff +; VI-NEXT: s_lshl_b32 s9, s48, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s64, 0xff +; VI-NEXT: s_lshl_b32 s11, s42, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s19, 0xff +; VI-NEXT: s_lshl_b32 s9, s38, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s55, 0xff +; VI-NEXT: s_lshl_b32 s11, s54, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s20, 0xff +; VI-NEXT: s_lshl_b32 s9, s36, 8 +; VI-NEXT: 
s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s52, 0xff +; VI-NEXT: s_lshl_b32 s11, s40, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s21, 0xff +; VI-NEXT: s_lshl_b32 s9, s34, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s50, 0xff +; VI-NEXT: s_lshl_b32 s11, s49, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s22, 0xff +; VI-NEXT: s_lshl_b32 s9, s30, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s39, 0xff +; VI-NEXT: s_lshl_b32 s11, s14, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s23, 0xff +; VI-NEXT: s_lshl_b32 s9, s89, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s37, 0xff +; VI-NEXT: s_lshl_b32 s11, s35, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s24, 0xff +; VI-NEXT: s_lshl_b32 s9, s79, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s31, 0xff +; VI-NEXT: s_lshl_b32 s11, s12, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v0 +; 
VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s25, 0xff +; VI-NEXT: s_lshl_b32 s9, s63, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s91, 0xff +; VI-NEXT: s_lshl_b32 s11, s90, 8 +; VI-NEXT: s_or_b32 s9, s9, s11 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s9, s72, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s88, 0xff +; VI-NEXT: s_lshl_b32 s10, s10, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s27, 0xff +; VI-NEXT: s_lshl_b32 s9, s73, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s78, 0xff +; VI-NEXT: s_lshl_b32 s10, s62, 8 +; VI-NEXT: s_or_b32 s9, s9, s10 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s9, s9, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v0 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s28, 0xff +; VI-NEXT: s_lshl_b32 s9, s74, 8 +; VI-NEXT: s_or_b32 s7, s7, s9 +; VI-NEXT: s_and_b32 s9, s61, 0xff +; VI-NEXT: s_lshl_b32 s8, s8, 8 +; VI-NEXT: s_or_b32 s8, s9, s8 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v0 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s7, s29, 0xff +; VI-NEXT: s_lshl_b32 s8, s75, 8 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: s_and_b32 s8, 
s60, 0xff +; VI-NEXT: s_lshl_b32 s9, s59, 8 +; VI-NEXT: s_or_b32 s8, s8, s9 +; VI-NEXT: s_and_b32 s7, s7, 0xffff +; VI-NEXT: s_lshl_b32 s8, s8, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v0 +; VI-NEXT: s_or_b32 s7, s7, s8 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: s_and_b32 s4, s4, 0xff +; VI-NEXT: s_lshl_b32 s7, s77, 8 +; VI-NEXT: s_or_b32 s4, s4, s7 +; VI-NEXT: s_and_b32 s7, s58, 0xff +; VI-NEXT: s_lshl_b32 s6, s6, 8 +; VI-NEXT: s_or_b32 s6, s7, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s6, s6, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v0 +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_and_b32 s4, s5, 0xff +; VI-NEXT: s_lshl_b32 s5, s76, 8 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_and_b32 s5, s57, 0xff +; VI-NEXT: s_lshl_b32 s6, s56, 8 +; VI-NEXT: s_or_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v0 +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_readlane_b32 s67, v4, 19 +; VI-NEXT: v_readlane_b32 s66, v4, 18 +; VI-NEXT: v_readlane_b32 s65, v4, 17 +; VI-NEXT: v_readlane_b32 s64, v4, 16 +; VI-NEXT: v_readlane_b32 s55, v4, 15 +; VI-NEXT: v_readlane_b32 s54, v4, 14 +; VI-NEXT: v_readlane_b32 s53, v4, 13 +; VI-NEXT: v_readlane_b32 s52, v4, 12 +; VI-NEXT: v_readlane_b32 s51, v4, 11 +; VI-NEXT: v_readlane_b32 s50, v4, 10 +; VI-NEXT: v_readlane_b32 s49, v4, 9 +; VI-NEXT: v_readlane_b32 s48, v4, 8 +; VI-NEXT: v_readlane_b32 s39, v4, 7 +; VI-NEXT: v_readlane_b32 s38, v4, 6 +; VI-NEXT: v_readlane_b32 s37, v4, 5 +; VI-NEXT: v_readlane_b32 s36, v4, 4 +; VI-NEXT: v_readlane_b32 s35, v4, 3 +; VI-NEXT: v_readlane_b32 s34, v4, 2 +; VI-NEXT: v_readlane_b32 
s31, v4, 1 +; VI-NEXT: v_readlane_b32 s30, v4, 0 +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB109_4: +; VI-NEXT: ; implicit-def: $sgpr53 ; VI-NEXT: ; implicit-def: $sgpr67 +; VI-NEXT: ; implicit-def: $sgpr44 +; VI-NEXT: ; implicit-def: $sgpr51 ; VI-NEXT: ; implicit-def: $sgpr65 -; VI-NEXT: ; implicit-def: $sgpr6 ; VI-NEXT: ; implicit-def: $sgpr66 +; VI-NEXT: ; implicit-def: $sgpr48 ; VI-NEXT: ; implicit-def: $sgpr64 +; VI-NEXT: ; implicit-def: $sgpr42 +; VI-NEXT: ; implicit-def: $sgpr38 ; VI-NEXT: ; implicit-def: $sgpr55 ; VI-NEXT: ; implicit-def: $sgpr54 +; VI-NEXT: ; implicit-def: $sgpr36 ; VI-NEXT: ; implicit-def: $sgpr52 -; VI-NEXT: ; implicit-def: $sgpr8 -; VI-NEXT: ; implicit-def: $sgpr53 -; VI-NEXT: ; implicit-def: $sgpr51 +; VI-NEXT: ; implicit-def: $sgpr40 +; VI-NEXT: ; implicit-def: $sgpr34 ; VI-NEXT: ; implicit-def: $sgpr50 ; VI-NEXT: ; implicit-def: $sgpr49 +; VI-NEXT: ; implicit-def: $sgpr30 ; VI-NEXT: ; implicit-def: $sgpr39 -; VI-NEXT: ; implicit-def: $sgpr10 -; VI-NEXT: ; implicit-def: $sgpr48 -; VI-NEXT: ; implicit-def: $sgpr38 +; VI-NEXT: ; implicit-def: $sgpr14 +; VI-NEXT: ; implicit-def: $sgpr89 ; VI-NEXT: ; implicit-def: $sgpr37 -; VI-NEXT: ; implicit-def: $sgpr36 -; VI-NEXT: ; implicit-def: $sgpr34 -; VI-NEXT: ; implicit-def: $sgpr12 ; VI-NEXT: ; implicit-def: $sgpr35 +; VI-NEXT: ; implicit-def: $sgpr79 ; VI-NEXT: ; implicit-def: $sgpr31 -; VI-NEXT: ; implicit-def: $sgpr30 +; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr63 ; VI-NEXT: ; implicit-def: $sgpr91 -; VI-NEXT: ; implicit-def: $sgpr89 -; VI-NEXT: ; implicit-def: $sgpr14 ; VI-NEXT: ; implicit-def: $sgpr90 +; VI-NEXT: ; implicit-def: $sgpr72 ; VI-NEXT: ; implicit-def: $sgpr88 -; VI-NEXT: ; implicit-def: $sgpr79 -; VI-NEXT: ; implicit-def: $sgpr78 -; VI-NEXT: ; implicit-def: $sgpr76 -; 
VI-NEXT: ; implicit-def: $sgpr40 -; VI-NEXT: ; implicit-def: $sgpr77 -; VI-NEXT: ; implicit-def: $sgpr75 -; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr10 ; VI-NEXT: ; implicit-def: $sgpr73 -; VI-NEXT: ; implicit-def: $sgpr63 -; VI-NEXT: ; implicit-def: $sgpr42 -; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr78 ; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr74 ; VI-NEXT: ; implicit-def: $sgpr61 +; VI-NEXT: ; implicit-def: $sgpr8 +; VI-NEXT: ; implicit-def: $sgpr75 ; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr58 -; VI-NEXT: ; implicit-def: $sgpr44 ; VI-NEXT: ; implicit-def: $sgpr59 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr58 +; VI-NEXT: ; implicit-def: $sgpr6 +; VI-NEXT: ; implicit-def: $sgpr76 ; VI-NEXT: ; implicit-def: $sgpr57 ; VI-NEXT: ; implicit-def: $sgpr56 ; VI-NEXT: s_branch .LBB109_2 -; VI-NEXT: .LBB109_4: -; VI-NEXT: v_mov_b32_e32 v19, s44 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v19, s42 -; VI-NEXT: v_mov_b32_e32 v1, s16 -; VI-NEXT: v_mov_b32_e32 v2, s17 -; VI-NEXT: v_mov_b32_e32 v3, s18 -; VI-NEXT: v_mov_b32_e32 v4, s19 -; VI-NEXT: v_mov_b32_e32 v5, s20 -; VI-NEXT: v_mov_b32_e32 v6, s21 -; VI-NEXT: v_mov_b32_e32 v7, s22 -; VI-NEXT: v_mov_b32_e32 v8, s23 -; VI-NEXT: v_mov_b32_e32 v9, s24 -; VI-NEXT: v_mov_b32_e32 v10, s25 -; VI-NEXT: v_mov_b32_e32 v11, s26 -; VI-NEXT: v_mov_b32_e32 v12, s27 -; VI-NEXT: v_mov_b32_e32 v13, s28 -; VI-NEXT: v_mov_b32_e32 v14, s29 -; VI-NEXT: v_mov_b32_e32 v15, s4 -; VI-NEXT: v_mov_b32_e32 v16, s5 -; VI-NEXT: v_mov_b32_e32 v18, s67 -; VI-NEXT: v_mov_b32_e32 v62, s65 -; VI-NEXT: v_mov_b32_e32 v17, s66 -; VI-NEXT: v_mov_b32_e32 v60, s64 -; VI-NEXT: v_mov_b32_e32 v61, s55 -; VI-NEXT: v_mov_b32_e32 v58, s54 -; VI-NEXT: v_mov_b32_e32 v59, s52 -; VI-NEXT: 
v_mov_b32_e32 v57, s53 -; VI-NEXT: v_mov_b32_e32 v47, s51 -; VI-NEXT: v_mov_b32_e32 v56, s50 -; VI-NEXT: v_mov_b32_e32 v46, s49 -; VI-NEXT: v_mov_b32_e32 v45, s39 -; VI-NEXT: v_mov_b32_e32 v44, s48 -; VI-NEXT: v_mov_b32_e32 v42, s38 -; VI-NEXT: v_mov_b32_e32 v43, s37 -; VI-NEXT: v_mov_b32_e32 v41, s36 -; VI-NEXT: v_mov_b32_e32 v40, s34 -; VI-NEXT: v_mov_b32_e32 v55, s35 -; VI-NEXT: v_mov_b32_e32 v53, s31 -; VI-NEXT: v_mov_b32_e32 v54, s30 -; VI-NEXT: v_mov_b32_e32 v52, s91 -; VI-NEXT: v_mov_b32_e32 v51, s89 -; VI-NEXT: v_mov_b32_e32 v50, s90 -; VI-NEXT: v_mov_b32_e32 v48, s88 -; VI-NEXT: v_mov_b32_e32 v49, s79 -; VI-NEXT: v_mov_b32_e32 v39, s78 -; VI-NEXT: v_mov_b32_e32 v38, s76 -; VI-NEXT: v_mov_b32_e32 v37, s77 -; VI-NEXT: v_mov_b32_e32 v35, s75 -; VI-NEXT: v_mov_b32_e32 v36, s74 -; VI-NEXT: v_mov_b32_e32 v34, s73 -; VI-NEXT: v_mov_b32_e32 v33, s63 -; VI-NEXT: v_mov_b32_e32 v32, s72 -; VI-NEXT: v_mov_b32_e32 v30, s62 -; VI-NEXT: v_mov_b32_e32 v31, s61 -; VI-NEXT: v_mov_b32_e32 v29, s60 -; VI-NEXT: v_mov_b32_e32 v28, s58 -; VI-NEXT: v_mov_b32_e32 v27, s59 -; VI-NEXT: v_mov_b32_e32 v25, s57 -; VI-NEXT: v_mov_b32_e32 v26, s56 -; VI-NEXT: v_mov_b32_e32 v21, s12 -; VI-NEXT: v_mov_b32_e32 v22, s10 -; VI-NEXT: v_mov_b32_e32 v23, s8 -; VI-NEXT: v_mov_b32_e32 v24, s6 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v19, s40 -; VI-NEXT: v_mov_b32_e32 v20, s14 -; VI-NEXT: .LBB109_5: ; %end -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; VI-NEXT: v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v24 -; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v62, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; 
VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v61 -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v23 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v58 -; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v57 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v56 -; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v46 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 -; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: 
v_lshlrev_b32_e32 v1, 8, v44 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v43 -; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v41 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v21 -; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v55 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v54 -; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v52 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v20 -; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, 
v50 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v49 -; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v39 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v19 -; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v37 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v36 -; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v34 -; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_readlane_b32 s67, v63, 19 -; VI-NEXT: v_readlane_b32 s66, v63, 18 -; VI-NEXT: v_readlane_b32 s65, v63, 17 -; VI-NEXT: v_readlane_b32 s64, v63, 16 -; VI-NEXT: v_readlane_b32 s55, v63, 15 -; VI-NEXT: v_readlane_b32 s54, v63, 14 
-; VI-NEXT: v_readlane_b32 s53, v63, 13 -; VI-NEXT: v_readlane_b32 s52, v63, 12 -; VI-NEXT: v_readlane_b32 s51, v63, 11 -; VI-NEXT: v_readlane_b32 s50, v63, 10 -; VI-NEXT: v_readlane_b32 s49, v63, 9 -; VI-NEXT: v_readlane_b32 s48, v63, 8 -; VI-NEXT: v_readlane_b32 s39, v63, 7 -; VI-NEXT: v_readlane_b32 s38, v63, 6 -; VI-NEXT: v_readlane_b32 s37, v63, 5 -; VI-NEXT: v_readlane_b32 s36, v63, 4 -; VI-NEXT: v_readlane_b32 s35, v63, 3 -; VI-NEXT: v_readlane_b32 s34, v63, 2 -; VI-NEXT: v_readlane_b32 s31, v63, 1 -; VI-NEXT: v_readlane_b32 s30, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v32 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v31 -; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v29 -; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v27 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 -; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte 
Folded Reload -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v32bf16_to_v64i8_scalar: ; GFX9: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll index 6fe66655de3d6..7e3fc4805ef48 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll @@ -2014,10 +2014,10 @@ define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -2029,29 +2029,33 @@ define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB22_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB22_2 ; SI-NEXT: .LBB22_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; 
SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2064,12 +2068,12 @@ define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB22_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -2080,15 +2084,15 @@ define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; VI-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -2098,9 +2102,9 @@ define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB22_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2273,30 +2277,34 @@ define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v0, 
v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 ; SI-NEXT: s_cbranch_execnz .LBB23_3 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: .LBB23_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB23_4: @@ -2311,7 +2319,7 @@ define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB23_4 ; VI-NEXT: .LBB23_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 @@ -2319,7 +2327,7 @@ define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, 
s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -2327,15 +2335,15 @@ define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s16, 16 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_add_f32_e32 v0, s4, v0 @@ -2345,9 +2353,9 @@ define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB23_3: ; VI-NEXT: s_branch .LBB23_2 @@ -5158,10 +5166,10 @@ define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v1 +; SI-NEXT: 
v_mul_f32_e32 v4, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -5173,29 +5181,33 @@ define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB46_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB46_2 ; SI-NEXT: .LBB46_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; 
SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -5208,12 +5220,12 @@ define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB46_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -5224,15 +5236,15 @@ define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -5242,9 +5254,9 @@ define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v2, 
0xffff0000, v2 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB46_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -5417,30 +5429,34 @@ define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: 
v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: .LBB47_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: @@ -5455,7 +5471,7 @@ define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inr ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB47_4 ; VI-NEXT: .LBB47_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 @@ -5463,7 +5479,7 @@ define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inr ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -5471,15 +5487,15 @@ define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inr ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s16, 16 ; 
VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_add_f32_e32 v0, s4, v0 @@ -5489,9 +5505,9 @@ define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inr ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB47_3: ; VI-NEXT: s_branch .LBB47_2 @@ -8014,10 +8030,10 @@ define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -8029,29 +8045,33 @@ define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB66_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: 
$vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB66_2 ; SI-NEXT: .LBB66_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8064,12 +8084,12 @@ define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB66_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -8080,15 +8100,15 @@ define <2 x 
i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -8098,9 +8118,9 @@ define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB66_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -8273,30 +8293,34 @@ define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 ; SI-NEXT: s_cbranch_scc0 .LBB67_4 ; SI-NEXT: ; %bb.1: ; 
%cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 ; SI-NEXT: s_cbranch_execnz .LBB67_3 ; SI-NEXT: .LBB67_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: .LBB67_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB67_4: @@ -8311,7 +8335,7 @@ define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB67_4 ; VI-NEXT: .LBB67_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 @@ -8319,7 +8343,7 @@ define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x 
bfloat> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -8327,15 +8351,15 @@ define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s16, 16 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_add_f32_e32 v0, s4, v0 @@ -8345,9 +8369,9 @@ define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB67_3: ; VI-NEXT: s_branch .LBB67_2 @@ -10543,10 +10567,10 @@ define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -10558,29 +10582,33 @@ define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB82_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB82_2 ; SI-NEXT: .LBB82_4: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, 
v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -10593,12 +10621,12 @@ define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB82_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -10609,15 +10637,15 @@ define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v4, 
0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -10627,9 +10655,9 @@ define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB82_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -10802,30 +10830,34 @@ define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 ; SI-NEXT: s_cbranch_scc0 .LBB83_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 ; SI-NEXT: s_cbranch_execnz .LBB83_3 ; SI-NEXT: .LBB83_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: 
v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: .LBB83_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB83_4: @@ -10840,7 +10872,7 @@ define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a, ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB83_4 ; VI-NEXT: .LBB83_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 @@ -10848,7 +10880,7 @@ define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -10856,15 +10888,15 @@ define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s16, 
0xffff0000 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s16, 16 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_add_f32_e32 v0, s4, v0 @@ -10874,9 +10906,9 @@ define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB83_3: ; VI-NEXT: s_branch .LBB83_2 @@ -12733,19 +12765,21 @@ define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB94_2 ; SI-NEXT: .LBB94_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 
0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] @@ -12792,10 +12826,10 @@ define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v3, 16 -; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB94_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -12983,19 +13017,21 @@ define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 ; SI-NEXT: s_cbranch_execnz .LBB95_3 ; SI-NEXT: .LBB95_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; SI-NEXT: 
v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: .LBB95_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] @@ -13028,12 +13064,11 @@ define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i3 ; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 @@ -13048,9 +13083,10 @@ define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i3 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v1, v0, v1, 16 -; VI-NEXT: v_alignbit_b32 v0, v3, v2, 16 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; 
VI-NEXT: .LBB95_3: ; VI-NEXT: s_branch .LBB95_2 @@ -14617,10 +14653,10 @@ define <4 x half> @bitcast_v4bf16_to_v4f16(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v3, 16 -; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB102_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -14856,12 +14892,11 @@ define inreg <4 x half> @bitcast_v4bf16_to_v4f16_scalar(<4 x bfloat> inreg %a, i ; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 @@ -14876,9 +14911,10 @@ define inreg <4 x half> @bitcast_v4bf16_to_v4f16_scalar(<4 x bfloat> inreg %a, i ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v1, v0, v1, 16 -; VI-NEXT: v_alignbit_b32 v0, v3, v2, 16 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB103_3: ; VI-NEXT: s_branch .LBB103_2 @@ -16095,10 +16131,10 @@ define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v11, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v9, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v3 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -16117,15 +16153,18 @@ define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB108_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 -; SI-NEXT: v_alignbit_b32 v4, v6, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v4, v1, v2 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v8 ; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 @@ -16133,18 +16172,21 @@ define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB108_2 ; SI-NEXT: .LBB108_4: ; %cmp.true -; 
SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v4, v1, v2 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 @@ -16178,50 +16220,51 @@ define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) { ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB108_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 ; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9 -; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; VI-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 ; VI-NEXT: s_movk_i32 s6, 0x7fff -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: 
v_add_u32_e32 v1, vcc, s6, v1 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; VI-NEXT: v_or_b32_e32 v1, v9, v0 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 ; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 -; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; VI-NEXT: v_bfe_u32 v4, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; VI-NEXT: v_cndmask_b32_e32 v10, v3, v4, vcc +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v1, v6, v2, 16 -; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; VI-NEXT: v_or_b32_e32 v0, v8, v0 ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; 
VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 ; VI-NEXT: .LBB108_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v8 @@ -16471,35 +16514,41 @@ define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 ; SI-NEXT: s_cbranch_scc0 .LBB109_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 -; SI-NEXT: v_alignbit_b32 v4, v6, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v4, v1, v2 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v8 ; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 ; SI-NEXT: s_cbranch_execnz .LBB109_3 ; SI-NEXT: .LBB109_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: 
v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v4, v1, v2 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 @@ -16526,74 +16575,75 @@ define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 ; VI-NEXT: s_lshr_b32 s8, s17, 24 -; VI-NEXT: s_lshr_b32 s5, s17, 16 -; VI-NEXT: s_lshr_b32 s9, s17, 8 -; VI-NEXT: s_lshr_b32 s10, s16, 16 -; VI-NEXT: s_lshr_b32 s11, s16, 8 +; VI-NEXT: s_lshr_b32 s10, s17, 16 +; VI-NEXT: s_lshr_b32 s5, s17, 8 +; VI-NEXT: s_lshr_b32 s11, s16, 16 +; VI-NEXT: s_lshr_b32 s9, s16, 8 ; VI-NEXT: s_cbranch_execnz .LBB109_4 ; VI-NEXT: .LBB109_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_cndmask_b32_e32 v7, v2, v3, vcc +; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; 
VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_e32 v2, v8, v1 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: s_lshl_b32 s4, s16, 16 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; VI-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v5, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v2, v6, v3, 16 -; VI-NEXT: v_alignbit_b32 v1, v0, v4, 16 -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; VI-NEXT: v_or_b32_e32 v1, v0, v1 ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; VI-NEXT: 
v_lshrrev_b32_e32 v6, 16, v7 ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 ; VI-NEXT: v_mov_b32_e32 v4, v8 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB109_3: +; VI-NEXT: ; implicit-def: $sgpr9 ; VI-NEXT: ; implicit-def: $sgpr11 -; VI-NEXT: ; implicit-def: $sgpr10 ; VI-NEXT: ; implicit-def: $sgpr4 -; VI-NEXT: ; implicit-def: $sgpr9 ; VI-NEXT: ; implicit-def: $sgpr5 +; VI-NEXT: ; implicit-def: $sgpr10 ; VI-NEXT: ; implicit-def: $sgpr8 ; VI-NEXT: s_branch .LBB109_2 ; VI-NEXT: .LBB109_4: -; VI-NEXT: v_mov_b32_e32 v1, s11 -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v2, s11 +; VI-NEXT: v_mov_b32_e32 v6, s10 ; VI-NEXT: v_mov_b32_e32 v7, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_mov_b32_e32 v3, s4 -; VI-NEXT: v_mov_b32_e32 v0, s16 -; VI-NEXT: v_mov_b32_e32 v6, s5 ; VI-NEXT: v_mov_b32_e32 v4, s17 ; VI-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll index e5245f7bd71d3..18c503cc7a6ed 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll @@ -1896,12 +1896,12 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v7, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], 
vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1913,12 +1913,15 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB10_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v0, v9, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v7, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr7 @@ -1928,24 +1931,27 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB10_2 ; SI-NEXT: .LBB10_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; 
SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1958,12 +1964,12 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) { ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB10_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -1974,15 +1980,15 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 ; 
VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -1992,15 +1998,15 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v3, 16 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -2010,9 +2016,9 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB10_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2228,40 +2234,46 @@ define inreg <3 x i32> @bitcast_v6bf16_to_v3i32_scalar(<6 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v6, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v4, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21 ; SI-NEXT: s_cbranch_scc0 .LBB11_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v0, v8, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v6, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v9 ; SI-NEXT: s_cbranch_execnz .LBB11_3 ; SI-NEXT: .LBB11_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_add_f32_e32 v2, 
0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: .LBB11_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB11_4: @@ -2276,7 +2288,7 @@ define inreg <3 x i32> @bitcast_v6bf16_to_v3i32_scalar(<6 x bfloat> inreg %a, i3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB11_4 ; VI-NEXT: .LBB11_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 @@ -2284,7 +2296,7 @@ define inreg <3 x i32> @bitcast_v6bf16_to_v3i32_scalar(<6 x bfloat> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s18, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -2292,17 +2304,17 @@ define inreg <3 x i32> @bitcast_v6bf16_to_v3i32_scalar(<6 x bfloat> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; VI-NEXT: v_add_f32_e32 v3, s4, v0 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 @@ -2310,15 +2322,15 @@ define inreg <3 x i32> @bitcast_v6bf16_to_v3i32_scalar(<6 x bfloat> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v3, s4, v0 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s16, 16 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_add_f32_e32 v0, s4, v0 @@ -2328,9 +2340,9 @@ define inreg <3 x i32> @bitcast_v6bf16_to_v3i32_scalar(<6 x bfloat> inreg %a, i3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB11_3: ; VI-NEXT: s_branch 
.LBB11_2 @@ -5039,12 +5051,12 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v9, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v7, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -5056,12 +5068,15 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB26_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v0, v9, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v7, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr7 @@ -5071,24 +5086,27 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB26_2 ; SI-NEXT: .LBB26_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; SI-NEXT: 
v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -5101,12 +5119,12 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) { ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB26_2 ; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 @@ -5117,15 +5135,15 @@ 
define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 @@ -5135,15 +5153,15 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v1, v1, v3, 16 -; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 -; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 @@ -5153,9 +5171,9 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v4, 
vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB26_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -5371,40 +5389,46 @@ define inreg <3 x float> @bitcast_v6bf16_to_v3f32_scalar(<6 x bfloat> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v6, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v4, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21 ; SI-NEXT: s_cbranch_scc0 .LBB27_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v0, v8, 16 -; SI-NEXT: v_alignbit_b32 v1, v1, v6, 16 -; SI-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v9 ; SI-NEXT: s_cbranch_execnz .LBB27_3 ; SI-NEXT: .LBB27_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; 
SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: .LBB27_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB27_4: @@ -5419,7 +5443,7 @@ define inreg <3 x float> @bitcast_v6bf16_to_v3f32_scalar(<6 x bfloat> inreg %a, ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_cbranch_execnz .LBB27_4 ; VI-NEXT: .LBB27_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 ; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 @@ -5427,7 +5451,7 @@ define inreg <3 x float> @bitcast_v6bf16_to_v3f32_scalar(<6 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s18, 16 ; VI-NEXT: v_cndmask_b32_e32 
v1, v2, v3, vcc ; VI-NEXT: v_add_f32_e32 v2, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 @@ -5435,17 +5459,17 @@ define inreg <3 x float> @bitcast_v6bf16_to_v3f32_scalar(<6 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v1, s4, v0 ; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; VI-NEXT: v_add_f32_e32 v3, s4, v0 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 @@ -5453,15 +5477,15 @@ define inreg <3 x float> @bitcast_v6bf16_to_v3f32_scalar(<6 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f32_e32 v3, s4, v0 ; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 +; VI-NEXT: s_lshl_b32 s4, s16, 16 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; 
VI-NEXT: v_add_f32_e32 v0, s4, v0 @@ -5471,9 +5495,9 @@ define inreg <3 x float> @bitcast_v6bf16_to_v3f32_scalar(<6 x bfloat> inreg %a, ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB27_3: ; VI-NEXT: s_branch .LBB27_2 @@ -7465,12 +7489,12 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v15, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v4 ; SI-NEXT: v_mul_f32_e32 v12, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v4 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -7493,46 +7517,56 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB38_3: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v15 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v12 -; SI-NEXT: v_alignbit_b32 v0, v0, v17, 16 -; SI-NEXT: v_alignbit_b32 v4, v6, v15, 16 -; SI-NEXT: v_alignbit_b32 v8, v10, v13, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v12 +; SI-NEXT: 
v_or_b32_e32 v4, v1, v2 +; SI-NEXT: v_or_b32_e32 v8, v6, v7 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v12 ; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 ; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v13 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v12 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB38_2 ; SI-NEXT: .LBB38_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v4, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v12 -; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v11, 
0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11 +; SI-NEXT: v_or_b32_e32 v8, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_alignbit_b32 v8, v10, v1, 16 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 @@ -7546,9 +7580,9 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; VI-LABEL: bitcast_v6bf16_to_v12i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v13, v2 -; VI-NEXT: v_mov_b32_e32 v16, v1 -; VI-NEXT: v_mov_b32_e32 v15, v0 +; VI-NEXT: v_mov_b32_e32 v8, v2 +; VI-NEXT: v_mov_b32_e32 v14, v1 +; VI-NEXT: v_mov_b32_e32 v13, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: ; implicit-def: $vgpr2 @@ -7563,89 +7597,91 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB38_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v13 -; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v16 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v15 -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[13:14] -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[15:16] -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v13 ; VI-NEXT: .LBB38_2: ; %Flow ; VI-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB38_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v14 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 ; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; VI-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v14 ; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 ; VI-NEXT: s_movk_i32 s6, 0x7fff -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v16, v1, v0, 16 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 -; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v15, v1, v0, 16 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 +; VI-NEXT: 
v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 +; VI-NEXT: v_or_b32_e32 v1, v14, v0 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 ; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 -; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 -; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v13, v1, v0, 16 -; VI-NEXT: v_mov_b32_e32 v14, 0x7fc07fc0 -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[15:16] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[13:14] -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v13 -; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v16 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v16 +; VI-NEXT: v_cndmask_b32_e32 v15, v3, v4, vcc +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: 
v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v10, v4, v5, vcc +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v10 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; VI-NEXT: v_or_b32_e32 v0, v13, v0 +; VI-NEXT: v_or_b32_e32 v6, v8, v3 +; VI-NEXT: v_mov_b32_e32 v7, 0x7fc07fc0 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[6:7] +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v15 ; VI-NEXT: .LBB38_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v0, v15 -; VI-NEXT: v_mov_b32_e32 v4, v16 -; VI-NEXT: v_mov_b32_e32 v8, v13 +; VI-NEXT: v_mov_b32_e32 v0, v13 +; VI-NEXT: v_mov_b32_e32 v4, v14 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v6bf16_to_v12i8: @@ -7981,47 +8017,57 @@ define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v17, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v15, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19 ; 
SI-NEXT: v_mul_f32_e64 v13, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s21 ; SI-NEXT: s_cbranch_scc0 .LBB39_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v12 -; SI-NEXT: v_alignbit_b32 v0, v0, v17, 16 -; SI-NEXT: v_alignbit_b32 v4, v6, v15, 16 -; SI-NEXT: v_alignbit_b32 v8, v10, v13, 16 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v15 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v13 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v12 +; SI-NEXT: v_or_b32_e32 v4, v1, v2 +; SI-NEXT: v_or_b32_e32 v8, v6, v7 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v12 ; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 ; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v12 ; SI-NEXT: s_cbranch_execnz .LBB39_3 ; SI-NEXT: .LBB39_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, 
v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v12 -; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_or_b32_e32 v4, v1, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v12 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11 +; SI-NEXT: v_or_b32_e32 v8, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v11 -; SI-NEXT: v_alignbit_b32 v8, v10, v1, 16 ; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 ; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 @@ -8052,111 +8098,111 @@ define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i3 ; VI-NEXT: s_cmp_lg_u32 s19, 0 ; VI-NEXT: s_cbranch_scc0 .LBB39_3 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: s_lshr_b32 s19, s16, 8 -; VI-NEXT: s_lshr_b32 s10, s18, 16 +; VI-NEXT: s_lshr_b32 s19, s18, 16 ; VI-NEXT: s_lshr_b32 s11, s18, 8 -; VI-NEXT: s_lshr_b32 s12, s17, 24 +; VI-NEXT: s_lshr_b32 s14, s17, 24 ; VI-NEXT: s_lshr_b32 s13, s17, 16 -; VI-NEXT: s_lshr_b32 s15, s17, 8 -; VI-NEXT: s_lshr_b32 s14, s16, 16 +; VI-NEXT: s_lshr_b32 s10, s17, 8 +; VI-NEXT: s_lshr_b32 s15, s16, 16 +; VI-NEXT: s_lshr_b32 s12, s16, 8 ; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 ; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 ; VI-NEXT: s_cbranch_execnz .LBB39_4 ; VI-NEXT: .LBB39_2: ; %cmp.true -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, 
v1 ; VI-NEXT: s_and_b32 s4, s17, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s16, 16 -; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_mov_b32_e32 v3, 0x40c00000 +; VI-NEXT: v_add_f32_e32 v0, s4, v3 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: s_lshl_b32 s4, s17, 16 +; VI-NEXT: v_cndmask_b32_e32 v14, v1, v2, vcc +; VI-NEXT: v_add_f32_e32 v1, s4, v3 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v14 +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 ; VI-NEXT: s_and_b32 s4, s16, 0xffff0000 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_add_f32_e32 v2, s4, v0 -; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_alignbit_b32 v14, v2, v1, 16 -; VI-NEXT: v_add_f32_e32 v1, s4, v0 -; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: s_and_b32 s4, 
s18, 0xffff0000 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_add_f32_e32 v0, s4, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_or_b32_e32 v2, v13, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v3 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: s_lshl_b32 s4, s16, 16 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_add_f32_e32 v0, s4, v3 +; VI-NEXT: v_cndmask_b32_e32 v15, v1, v4, vcc +; VI-NEXT: v_bfe_u32 v4, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: s_and_b32 s4, s18, 0xffff0000 +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; VI-NEXT: v_add_f32_e32 v4, s4, v3 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: s_lshl_b32 s4, s18, 16 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v3, s4, v3 +; VI-NEXT: v_cndmask_b32_e32 v10, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v8, v0, v1, 16 -; VI-NEXT: v_mov_b32_e32 v9, 0x7fc07fc0 -; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[14:15] -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] -; VI-NEXT: v_lshrrev_b32_e32 v10, 
16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v8 -; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v15 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v14 -; VI-NEXT: s_branch .LBB39_5 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v10 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; VI-NEXT: v_or_b32_e32 v1, v0, v1 +; VI-NEXT: v_or_b32_e32 v6, v8, v4 +; VI-NEXT: v_mov_b32_e32 v7, 0x7fc07fc0 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[6:7] +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v15 +; VI-NEXT: v_mov_b32_e32 v4, v13 +; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB39_3: -; VI-NEXT: ; implicit-def: $sgpr19 -; VI-NEXT: ; implicit-def: $sgpr14 -; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr12 ; VI-NEXT: ; implicit-def: $sgpr15 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr10 ; VI-NEXT: ; implicit-def: $sgpr13 -; VI-NEXT: ; implicit-def: $sgpr12 +; VI-NEXT: ; implicit-def: $sgpr14 ; VI-NEXT: ; implicit-def: $sgpr11 -; VI-NEXT: ; implicit-def: $sgpr10 +; VI-NEXT: ; implicit-def: $sgpr19 ; VI-NEXT: ; implicit-def: $sgpr6 ; VI-NEXT: s_branch .LBB39_2 ; VI-NEXT: .LBB39_4: -; VI-NEXT: v_mov_b32_e32 v14, s16 -; VI-NEXT: v_mov_b32_e32 v15, s17 ; VI-NEXT: v_mov_b32_e32 v8, s18 -; VI-NEXT: v_mov_b32_e32 v1, s19 -; VI-NEXT: v_mov_b32_e32 v2, s14 -; VI-NEXT: v_mov_b32_e32 v5, s15 +; VI-NEXT: v_mov_b32_e32 v10, s19 +; VI-NEXT: v_mov_b32_e32 v0, s16 +; VI-NEXT: v_mov_b32_e32 v2, s15 +; VI-NEXT: v_mov_b32_e32 v7, s14 ; VI-NEXT: v_mov_b32_e32 v6, s13 -; VI-NEXT: v_mov_b32_e32 v7, s12 -; VI-NEXT: v_mov_b32_e32 v13, s11 -; VI-NEXT: v_mov_b32_e32 v10, s10 +; VI-NEXT: 
v_mov_b32_e32 v9, s11 +; VI-NEXT: v_mov_b32_e32 v1, s12 +; VI-NEXT: v_mov_b32_e32 v5, s10 ; VI-NEXT: v_mov_b32_e32 v11, s6 ; VI-NEXT: v_mov_b32_e32 v3, s4 -; VI-NEXT: .LBB39_5: ; %end -; VI-NEXT: v_mov_b32_e32 v0, v14 -; VI-NEXT: v_mov_b32_e32 v4, v15 -; VI-NEXT: v_mov_b32_e32 v9, v13 +; VI-NEXT: v_mov_b32_e32 v4, s17 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v6bf16_to_v12i8_scalar: @@ -11368,12 +11414,12 @@ define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v5, 16 -; VI-NEXT: v_alignbit_b32 v1, v1, v4, 16 -; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB48_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -11675,12 +11721,11 @@ define inreg <6 x half> @bitcast_v6bf16_to_v6f16_scalar(<6 x bfloat> inreg %a, i ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v2, v4, vcc ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; 
VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 @@ -11692,12 +11737,11 @@ define inreg <6 x half> @bitcast_v6bf16_to_v6f16_scalar(<6 x bfloat> inreg %a, i ; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: s_lshl_b32 s4, s18, 16 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc ; VI-NEXT: v_bfe_u32 v6, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v2 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 @@ -11712,10 +11756,12 @@ define inreg <6 x half> @bitcast_v6bf16_to_v6f16_scalar(<6 x bfloat> inreg %a, i ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v2, v0, v2, 16 -; VI-NEXT: v_alignbit_b32 v1, v5, v1, 16 -; VI-NEXT: v_alignbit_b32 v0, v4, v3, 16 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB49_3: ; VI-NEXT: s_branch .LBB49_2 @@ -12234,25 +12280,29 @@ define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: .LBB52_4: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 -; SI-NEXT: 
v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v4, v5, v2, 16 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v4, v2, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_setpc_b64 s[30:31] @@ -12315,12 +12365,12 @@ define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v5, 16 -; 
VI-NEXT: v_alignbit_b32 v1, v1, v4, 16 -; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: .LBB52_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -12563,25 +12613,29 @@ define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i3 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v8 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v4, v5, v2, 16 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; SI-NEXT: v_or_b32_e32 v4, v2, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: 
v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v6 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: .LBB53_3: ; %end ; SI-NEXT: s_setpc_b64 s[30:31] @@ -12616,12 +12670,11 @@ define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i3 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: s_lshl_b32 s4, s17, 16 ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc -; VI-NEXT: s_lshl_b32 s4, s17, 16 -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; VI-NEXT: v_add_f32_e32 v1, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v2, v4, vcc ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 @@ -12633,12 +12686,11 @@ define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i3 ; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: s_lshl_b32 s4, s18, 16 ; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; VI-NEXT: s_lshl_b32 s4, s18, 16 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; VI-NEXT: v_add_f32_e32 v2, s4, v0 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc ; VI-NEXT: v_bfe_u32 v6, v2, 16, 1 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v2 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 @@ -12653,10 +12705,12 @@ define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i3 ; 
VI-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_alignbit_b32 v2, v0, v2, 16 -; VI-NEXT: v_alignbit_b32 v1, v5, v1, 16 -; VI-NEXT: v_alignbit_b32 v0, v4, v3, 16 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB53_3: ; VI-NEXT: s_branch .LBB53_2 diff --git a/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll b/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll index cc9f595f9d0b6..4c2fd3323220a 100644 --- a/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll @@ -6,46 +6,46 @@ define amdgpu_kernel void @any_extend_vector_inreg_v16i8_to_v4i32(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) %arg1) local_unnamed_addr #0 { ; GFX6-LABEL: any_extend_vector_inreg_v16i8_to_v4i32: ; GFX6: ; %bb.0: ; %bb -; GFX6-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s15, 0xf000 +; GFX6-NEXT: s_mov_b32 s14, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s14 -; GFX6-NEXT: s_mov_b32 s1, s15 -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[12:13], 0x0 +; GFX6-NEXT: s_mov_b32 s12, s10 +; GFX6-NEXT: s_mov_b32 s13, s11 +; GFX6-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s4, s[12:13], 0x8 -; GFX6-NEXT: buffer_store_byte v0, off, 
s[0:3], 0 offset:13 -; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:15 -; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:14 -; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:8 -; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:11 -; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:10 -; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:4 -; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:6 -; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:1 -; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 -; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:3 -; GFX6-NEXT: s_lshr_b32 s8, s9, 16 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_load_dword s0, s[8:9], 0x8 +; GFX6-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:13 +; GFX6-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:15 +; GFX6-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:14 +; GFX6-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:8 +; GFX6-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:11 +; GFX6-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:10 +; GFX6-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:4 +; GFX6-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:6 +; GFX6-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:1 +; GFX6-NEXT: buffer_store_byte v0, off, s[12:15], 0 +; GFX6-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 s[6:7], s[4:5], 8 -; GFX6-NEXT: v_mov_b32_e32 v1, s11 -; GFX6-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:9 +; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], 8 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s7 +; GFX6-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:9 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:2 +; GFX6-NEXT: s_lshr_b32 s0, s2, 24 +; GFX6-NEXT: s_lshr_b32 s1, s5, 24 ; 
GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2 -; GFX6-NEXT: v_alignbit_b32 v0, s8, v0, 16 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:5 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:12 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:7 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:5 -; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:7 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:12 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: any_extend_vector_inreg_v16i8_to_v4i32: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 3ca7db155b385..7896bdfbaeba8 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -12144,13 +12144,14 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7LESS-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX7LESS-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7LESS-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7LESS-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7LESS-NEXT: v_add_f32_e32 v4, v4, v0 -; GFX7LESS-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX7LESS-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7LESS-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GFX7LESS-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; GFX7LESS-NEXT: v_add_f32_e32 v5, v2, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX7LESS-NEXT: 
v_lshrrev_b32_e32 v2, 16, v4 +; GFX7LESS-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7LESS-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7LESS-NEXT: v_mov_b32_e32 v5, v3 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[4:5], off, s[4:7], 0 glc @@ -12165,10 +12166,11 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v3 +; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7LESS-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; GFX7LESS-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX7LESS-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -12181,32 +12183,32 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s1, s[10:11], 0x0 -; GFX8-NEXT: s_lshl_b32 s12, s0, 16 -; GFX8-NEXT: s_and_b32 s13, s0, 0xffff0000 +; GFX8-NEXT: s_and_b32 s12, s0, 0xffff0000 +; GFX8-NEXT: s_lshl_b32 s13, s0, 16 ; GFX8-NEXT: s_mov_b32 s4, s10 ; GFX8-NEXT: s_mov_b32 s5, s11 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_add_f32_e32 v0, s12, v0 ; GFX8-NEXT: v_add_f32_e32 v2, s13, v2 ; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v5, v2, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; 
GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v4, s[0:1] -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index cd6d741beeab3..f29077191f74d 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -936,8 +936,9 @@ define void @v_store_global_v2bf16(<2 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 @@ -949,10 +950,11 @@ define void @v_store_global_v2bf16(<2 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX7-NEXT: 
v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 @@ -998,11 +1000,12 @@ define void @v_store_global_v3bf16(<3 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4 ; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -1012,9 +1015,10 @@ define void @v_store_global_v3bf16(<3 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 @@ -1071,10 +1075,12 @@ define void @v_store_global_v4bf16(<4 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v0, v6, v0, 16 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v1, v2, v3 +; GCN-NEXT: v_or_b32_e32 v0, v0, v6 ; 
GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 @@ -1086,14 +1092,16 @@ define void @v_store_global_v4bf16(<4 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 @@ -1134,26 +1142,30 @@ define void @v_store_global_v8bf16(<8 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-LABEL: v_store_global_v8bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v1 +; GCN-NEXT: v_lshrrev_b32_e32 
v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v3, v6, v7 +; GCN-NEXT: v_or_b32_e32 v2, v4, v5 +; GCN-NEXT: v_or_b32_e32 v1, v11, v10 +; GCN-NEXT: v_or_b32_e32 v0, v0, v12 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GCN-NEXT: v_alignbit_b32 v3, v2, v6, 16 -; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v7, v10, 16 -; GCN-NEXT: v_alignbit_b32 v0, v11, v0, 16 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1162,25 +1174,29 @@ define void @v_store_global_v8bf16(<8 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX7-NEXT: 
v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v0, v1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 -; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16 -; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16 ; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[8:9], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -1223,12 +1239,10 @@ define void @v_store_global_v16bf16(<16 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 @@ -1236,24 +1250,34 @@ define void @v_store_global_v16bf16(<16 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_alignbit_b32 v3, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v15, v18, 16 -; GCN-NEXT: v_alignbit_b32 v0, v19, v0, 16 -; GCN-NEXT: v_alignbit_b32 v7, v20, v14, 16 -; GCN-NEXT: v_alignbit_b32 v6, v13, v12, 16 -; GCN-NEXT: v_alignbit_b32 v5, v11, v10, 16 -; GCN-NEXT: v_alignbit_b32 v4, v9, v8, 16 +; GCN-NEXT: v_or_b32_e32 v3, v6, v7 +; GCN-NEXT: v_or_b32_e32 v2, v4, v5 +; GCN-NEXT: v_or_b32_e32 v1, v19, v18 +; GCN-NEXT: v_or_b32_e32 v0, v0, v20 +; GCN-NEXT: v_or_b32_e32 v7, v14, v15 +; GCN-NEXT: v_or_b32_e32 v6, v12, v13 +; GCN-NEXT: v_or_b32_e32 v5, v10, v11 +; GCN-NEXT: v_or_b32_e32 v4, v8, v9 ; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -1263,41 +1287,49 @@ define void @v_store_global_v16bf16(<16 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 
v0, 1.0, v0 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 -; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16 -; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; GFX7-NEXT: v_alignbit_b32 v14, v0, v1, 16 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v14, v1, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v13 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v12 -; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v13, v1, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10 -; GFX7-NEXT: v_alignbit_b32 v12, v0, v1, 16 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v12, v1, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v11, v1, v0 ; GFX7-NEXT: 
s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: v_alignbit_b32 v11, v0, v1, 16 -; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; GFX7-NEXT: v_or_b32_e32 v6, v6, v7 ; GFX7-NEXT: buffer_store_dwordx4 v[11:14], v[16:17], s[4:7], 0 addr64 offset:16 ; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[16:17], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -1348,26 +1380,32 @@ define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; GCN-NEXT: v_alignbit_b32 v21, v23, v22, 16 -; GCN-NEXT: v_alignbit_b32 v20, v31, v20, 16 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_or_b32_e32 v21, v22, v23 +; GCN-NEXT: v_or_b32_e32 v20, v20, v31 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_or_b32_e32 v19, v18, v19 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_alignbit_b32 v18, v17, v16, 16 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v18, v16, v17 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v4, v16, v4, 16 +; GCN-NEXT: 
v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_or_b32_e32 v5, v6, v7 +; GCN-NEXT: v_or_b32_e32 v4, v4, v16 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -1381,45 +1419,55 @@ define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v29 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v29 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v27 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v26 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v3 -; GCN-NEXT: v_alignbit_b32 v3, v0, v2, 16 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GCN-NEXT: v_or_b32_e32 v3, v1, v0 ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v30 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v30 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: 
v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v9 ; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; GCN-NEXT: v_alignbit_b32 v9, v6, v14, 16 -; GCN-NEXT: v_alignbit_b32 v8, v13, v12, 16 -; GCN-NEXT: v_alignbit_b32 v7, v11, v10, 16 -; GCN-NEXT: v_alignbit_b32 v6, v15, v16, 16 -; GCN-NEXT: v_alignbit_b32 v12, v28, v17, 16 -; GCN-NEXT: v_alignbit_b32 v11, v22, v23, 16 -; GCN-NEXT: v_alignbit_b32 v10, v25, v24, 16 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v2 +; GCN-NEXT: v_or_b32_e32 v2, v7, v6 +; GCN-NEXT: v_or_b32_e32 v9, v14, v15 +; GCN-NEXT: v_or_b32_e32 v8, v12, v13 +; GCN-NEXT: v_or_b32_e32 v7, v10, v11 +; GCN-NEXT: v_or_b32_e32 v6, v28, v27 +; GCN-NEXT: v_or_b32_e32 v12, v17, v16 +; GCN-NEXT: v_or_b32_e32 v11, v23, v22 +; GCN-NEXT: v_or_b32_e32 v10, v24, v25 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[0:1], s[4:7], 0 addr64 offset:32 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v26 ; GCN-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v13 -; GCN-NEXT: v_alignbit_b32 v13, v6, v27, 16 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v13 +; GCN-NEXT: v_or_b32_e32 v13, v29, v6 ; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:48 ; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -1428,80 +1476,96 @@ define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX7-LABEL: v_store_global_v32bf16: ; GFX7: ; 
%bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v25, v25, v24, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16 -; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16 -; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v13 -; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16 +; GFX7-NEXT: v_or_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v0, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v6, v7 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v13 +; GFX7-NEXT: v_or_b32_e32 v13, v1, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11 +; GFX7-NEXT: 
v_mul_f32_e32 v1, 1.0, v10 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v11, v1, v0 ; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v12 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_alignbit_b32 v12, v7, v12, 16 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v30 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; GFX7-NEXT: v_alignbit_b32 v27, v29, v28, 16 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX7-NEXT: v_alignbit_b32 v11, v11, v10, 16 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v20 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_alignbit_b32 v26, v31, v26, 16 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: v_alignbit_b32 v4, v24, v4, 16 -; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v28, v6, v7, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX7-NEXT: v_or_b32_e32 v12, v7, v6 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v9 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v8 -; GFX7-NEXT: v_alignbit_b32 v10, v6, v7, 16 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v10, v7, v6 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v23 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v22 -; GFX7-NEXT: 
v_alignbit_b32 v9, v6, v7, 16 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v19 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v20 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v21 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; GFX7-NEXT: v_or_b32_e32 v9, v7, v6 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v19 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v18 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: v_alignbit_b32 v7, v6, v7, 16 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v8, v15, v8 +; GFX7-NEXT: v_or_b32_e32 v7, v7, v6 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v17 -; GFX7-NEXT: v_alignbit_b32 v8, v8, v14, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v16 -; GFX7-NEXT: v_alignbit_b32 v6, v6, v14, 16 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v16 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: v_or_b32_e32 v6, v15, v6 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v30 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v28 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v29 +; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX7-NEXT: v_or_b32_e32 v16, v18, v16 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v24 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v31 +; GFX7-NEXT: s_waitcnt vmcnt(2) +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: v_or_b32_e32 v17, v15, v14 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v27 +; GFX7-NEXT: 
v_mul_f32_e32 v15, 1.0, v26 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: v_or_b32_e32 v15, v15, v14 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v25 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: v_or_b32_e32 v14, v18, v14 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_store_dwordx4 v[25:28], v[0:1], s[4:7], 0 addr64 offset:48 +; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48 ; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32 ; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16 ; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 @@ -1565,26 +1629,32 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v21 -; GCN-NEXT: v_alignbit_b32 v21, v23, v22, 16 -; GCN-NEXT: v_alignbit_b32 v20, v31, v20, 16 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_or_b32_e32 v21, v22, v23 +; GCN-NEXT: v_or_b32_e32 v20, v20, v31 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_or_b32_e32 v19, v18, v19 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_alignbit_b32 v18, v17, v16, 16 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v18, v16, v17 ; GCN-NEXT: 
v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v13 -; GCN-NEXT: v_alignbit_b32 v13, v15, v14, 16 -; GCN-NEXT: v_alignbit_b32 v12, v16, v12, 16 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_or_b32_e32 v13, v14, v15 +; GCN-NEXT: v_or_b32_e32 v12, v12, v16 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 @@ -1596,29 +1666,37 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v29 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v29 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v28 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v11, v11, v10, 16 -; GCN-NEXT: v_alignbit_b32 v10, v9, v8, 16 -; GCN-NEXT: v_alignbit_b32 v3, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v22, v14, 16 -; GCN-NEXT: v_alignbit_b32 v0, v23, v0, 16 -; GCN-NEXT: v_alignbit_b32 v6, v26, v15, 16 -; GCN-NEXT: v_alignbit_b32 v5, v16, v17, 16 +; GCN-NEXT: v_and_b32_e32 v11, 
0xffff0000, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_or_b32_e32 v11, v10, v11 +; GCN-NEXT: v_or_b32_e32 v10, v8, v9 +; GCN-NEXT: v_or_b32_e32 v3, v6, v7 +; GCN-NEXT: v_or_b32_e32 v2, v4, v5 +; GCN-NEXT: v_or_b32_e32 v1, v23, v22 +; GCN-NEXT: v_or_b32_e32 v0, v0, v26 +; GCN-NEXT: v_or_b32_e32 v6, v15, v14 +; GCN-NEXT: v_or_b32_e32 v5, v17, v16 ; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136 ; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:132 ; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128 @@ -1643,9 +1721,9 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v25 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v4, v4, v23, 16 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_or_b32_e32 v4, v23, v4 ; GCN-NEXT: s_waitcnt vmcnt(14) ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 @@ -1660,41 +1738,48 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-NEXT: s_waitcnt vmcnt(7) ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: s_waitcnt vmcnt(6) -; 
GCN-NEXT: v_mul_f32_e32 v23, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v13 -; GCN-NEXT: v_alignbit_b32 v13, v7, v14, 16 -; GCN-NEXT: v_alignbit_b32 v12, v15, v16, 16 -; GCN-NEXT: v_alignbit_b32 v11, v17, v22, 16 -; GCN-NEXT: v_alignbit_b32 v10, v10, v23, 16 -; GCN-NEXT: v_alignbit_b32 v17, v20, v25, 16 -; GCN-NEXT: v_alignbit_b32 v16, v21, v18, 16 -; GCN-NEXT: v_alignbit_b32 v15, v26, v19, 16 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v13 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GCN-NEXT: 
v_lshrrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_or_b32_e32 v13, v14, v7 +; GCN-NEXT: v_or_b32_e32 v12, v16, v15 +; GCN-NEXT: v_or_b32_e32 v11, v22, v17 +; GCN-NEXT: v_or_b32_e32 v10, v23, v10 +; GCN-NEXT: v_or_b32_e32 v17, v25, v24 +; GCN-NEXT: v_or_b32_e32 v16, v19, v18 +; GCN-NEXT: v_or_b32_e32 v15, v21, v20 ; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72 ; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:16 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:48 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:44 ; GCN-NEXT: s_waitcnt vmcnt(7) ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: s_waitcnt vmcnt(6) @@ -1711,63 +1796,74 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v21 -; GCN-NEXT: v_alignbit_b32 v14, v7, v14, 16 -; GCN-NEXT: v_alignbit_b32 v7, v18, v24, 16 -; GCN-NEXT: v_alignbit_b32 v21, v19, v20, 16 -; GCN-NEXT: v_alignbit_b32 v20, v25, v22, 16 -; 
GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:48 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v21 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_or_b32_e32 v14, v14, v7 +; GCN-NEXT: v_or_b32_e32 v21, v19, v18 +; GCN-NEXT: v_or_b32_e32 v20, v24, v20 +; GCN-NEXT: v_or_b32_e32 v19, v23, v22 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:36 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:16 ; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v23 -; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v22 -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_alignbit_b32 v18, v18, v22, 16 +; GCN-NEXT: v_or_b32_e32 v18, v18, v7 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: 
s_waitcnt vmcnt(5) +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_or_b32_e32 v7, v7, v22 ; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v23 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24 ; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v25 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_alignbit_b32 v25, v22, v23, 16 -; GCN-NEXT: v_alignbit_b32 v24, v24, v26, 16 -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:44 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v26 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; GCN-NEXT: v_or_b32_e32 v25, v23, v22 +; GCN-NEXT: v_or_b32_e32 v24, v26, v24 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:12 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v29 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v27 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:4 ; GCN-NEXT: s_waitcnt vmcnt(2) ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_alignbit_b32 v23, v23, v22, 16 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_or_b32_e32 v23, v22, v23 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v26 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27 -; GCN-NEXT: v_lshrrev_b32_e32 
v22, 16, v22 -; GCN-NEXT: v_alignbit_b32 v22, v22, v26, 16 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GCN-NEXT: v_or_b32_e32 v22, v26, v22 ; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[8:9], s[4:7], 0 addr64 offset:112 ; GCN-NEXT: buffer_store_dwordx4 v[14:17], v[8:9], s[4:7], 0 addr64 offset:96 -; GCN-NEXT: buffer_store_dwordx4 v[22:25], v[8:9], s[4:7], 0 addr64 offset:80 -; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[8:9], s[4:7], 0 addr64 offset:64 +; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[8:9], s[4:7], 0 addr64 offset:80 +; GCN-NEXT: buffer_store_dwordx4 v[22:25], v[8:9], s[4:7], 0 addr64 offset:64 ; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[8:9], s[4:7], 0 addr64 offset:48 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -1789,47 +1885,58 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16 -; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v0, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; GFX7-NEXT: v_and_b32_e32 v7, 
0xffff0000, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v29 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v28 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: s_waitcnt vmcnt(7) ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 ; GFX7-NEXT: s_waitcnt vmcnt(6) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 ; GFX7-NEXT: s_waitcnt vmcnt(5) ; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_alignbit_b32 v36, v31, v32, 16 +; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v34 +; GFX7-NEXT: v_or_b32_e32 v36, v32, v31 ; GFX7-NEXT: s_waitcnt vmcnt(3) ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v37 -; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v34 -; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v38 -; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX7-NEXT: v_alignbit_b32 v35, v33, v34, 16 -; GFX7-NEXT: v_alignbit_b32 v34, v31, v32, 16 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX7-NEXT: v_or_b32_e32 v35, v34, v33 +; GFX7-NEXT: v_or_b32_e32 v34, v32, v31 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v39 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v48 -; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX7-NEXT: v_alignbit_b32 v33, v31, v32, 16 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX7-NEXT: 
v_or_b32_e32 v33, v32, v31 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:136 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 ; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:96 @@ -1844,19 +1951,22 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v37 ; GFX7-NEXT: s_waitcnt vmcnt(5) ; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v38 -; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: s_waitcnt vmcnt(3) +; GFX7-NEXT: v_mul_f32_e32 v36, 1.0, v48 +; GFX7-NEXT: v_lshrrev_b32_e32 v34, 16, v34 ; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v39 -; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v37, 16, v36 +; GFX7-NEXT: v_or_b32_e32 v36, v34, v33 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v49 -; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v48 -; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v50 -; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16 -; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16 +; GFX7-NEXT: v_and_b32_e32 v35, 0xffff0000, v35 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX7-NEXT: v_or_b32_e32 v35, v37, v35 +; GFX7-NEXT: v_or_b32_e32 v34, v34, v33 ; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 ; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 ; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:64 @@ -1867,26 +1977,32 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX7-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 ; GFX7-NEXT: s_waitcnt vmcnt(7) ; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 ; GFX7-NEXT: s_waitcnt vmcnt(6) ; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, 
v37 -; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX7-NEXT: v_or_b32_e32 v33, v37, v33 ; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:96 -; GFX7-NEXT: s_waitcnt vmcnt(3) -; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v49 +; GFX7-NEXT: s_waitcnt vmcnt(6) ; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v38 -; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX7-NEXT: s_waitcnt vmcnt(5) ; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v39 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX7-NEXT: s_waitcnt vmcnt(3) +; GFX7-NEXT: v_mul_f32_e32 v36, 1.0, v49 ; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v48 -; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v37, 16, v36 +; GFX7-NEXT: v_or_b32_e32 v36, v34, v33 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v50 -; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v51 -; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16 -; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16 +; GFX7-NEXT: v_and_b32_e32 v35, 0xffff0000, v35 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX7-NEXT: v_or_b32_e32 v35, v37, v35 +; GFX7-NEXT: v_or_b32_e32 v34, v34, v33 ; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 ; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 ; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 @@ -1897,83 +2013,100 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX7-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 ; GFX7-NEXT: s_waitcnt vmcnt(7) ; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 ; GFX7-NEXT: s_waitcnt vmcnt(6) ; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; GFX7-NEXT: 
v_alignbit_b32 v33, v33, v37, 16 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX7-NEXT: v_or_b32_e32 v33, v37, v33 ; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:80 -; GFX7-NEXT: s_waitcnt vmcnt(3) -; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v49 +; GFX7-NEXT: s_waitcnt vmcnt(6) ; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v38 -; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX7-NEXT: s_waitcnt vmcnt(5) ; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v39 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX7-NEXT: s_waitcnt vmcnt(3) +; GFX7-NEXT: v_mul_f32_e32 v36, 1.0, v49 ; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v48 -; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v37, 16, v36 +; GFX7-NEXT: v_or_b32_e32 v36, v34, v33 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v50 -; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35 -; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v51 -; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16 -; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16 +; GFX7-NEXT: v_and_b32_e32 v35, 0xffff0000, v35 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GFX7-NEXT: v_or_b32_e32 v35, v37, v35 +; GFX7-NEXT: v_or_b32_e32 v34, v34, v33 ; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 ; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX7-NEXT: v_or_b32_e32 v33, v37, v33 ; GFX7-NEXT: buffer_store_dwordx4 
v[33:36], v[31:32], s[4:7], 0 addr64 offset:64 ; GFX7-NEXT: s_nop 0 -; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v6, v7 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v13 -; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16 +; GFX7-NEXT: v_or_b32_e32 v13, v1, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10 -; GFX7-NEXT: v_alignbit_b32 v11, v0, v1, 16 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v11, v1, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v9 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; GFX7-NEXT: v_alignbit_b32 v10, v0, v1, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v23 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v10, v1, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v23 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v22 -; GFX7-NEXT: v_alignbit_b32 v12, v6, v7, 16 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v12, v7, v6 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v21 -; GFX7-NEXT: v_alignbit_b32 v9, v0, v1, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v19 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v20 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v9, v1, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v19 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v18 -; GFX7-NEXT: v_alignbit_b32 v8, v6, v7, 16 -; GFX7-NEXT: v_alignbit_b32 v7, v0, v1, 16 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: 
v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v8, v7, v6 +; GFX7-NEXT: v_or_b32_e32 v7, v1, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v17 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v16 -; GFX7-NEXT: v_alignbit_b32 v6, v0, v1, 16 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v6, v1, v0 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v38 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v30 -; GFX7-NEXT: v_alignbit_b32 v17, v0, v1, 16 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v17, v1, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v27 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v26 -; GFX7-NEXT: v_alignbit_b32 v16, v14, v15, 16 -; GFX7-NEXT: v_alignbit_b32 v15, v0, v1, 16 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v16, v15, v14 +; GFX7-NEXT: v_or_b32_e32 v15, v1, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v25 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v24 -; GFX7-NEXT: v_alignbit_b32 v14, v0, v1, 16 -; GFX7-NEXT: v_alignbit_b32 v4, v33, v4, 16 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v14, v1, v0 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v33 ; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[31:32], s[4:7], 0 addr64 offset:48 ; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[31:32], s[4:7], 0 addr64 offset:32 ; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[31:32], s[4:7], 0 addr64 offset:16 @@ -2887,8 +3020,9 @@ define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) { ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, 
v0 ; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 @@ -2900,10 +3034,11 @@ define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 @@ -2949,11 +3084,12 @@ define void @test_arg_store_v3bf16(<3 x bfloat> %in, ptr addrspace(1) %out) { ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4 ; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -2963,9 +3099,10 @@ define void @test_arg_store_v3bf16(<3 x bfloat> %in, ptr addrspace(1) %out) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: 
v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 @@ -3022,10 +3159,12 @@ define void @test_arg_store_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) { ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; GCN-NEXT: v_alignbit_b32 v0, v6, v0, 16 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v1, v2, v3 +; GCN-NEXT: v_or_b32_e32 v0, v0, v6 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 @@ -3037,14 +3176,16 @@ define void @test_arg_store_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 @@ -3085,26 +3226,30 @@ define void @test_arg_store_v8bf16(<8 x bfloat> %in, ptr addrspace(1) %out) { ; GCN-LABEL: test_arg_store_v8bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v3, v6, v7 +; GCN-NEXT: v_or_b32_e32 v2, v4, v5 +; GCN-NEXT: v_or_b32_e32 v1, v11, v10 +; GCN-NEXT: v_or_b32_e32 v0, v0, v12 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GCN-NEXT: v_alignbit_b32 v3, v2, v6, 16 -; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v7, v10, 16 -; GCN-NEXT: v_alignbit_b32 v0, v11, v0, 16 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -3113,25 +3258,29 @@ define void @test_arg_store_v8bf16(<8 x bfloat> %in, ptr addrspace(1) %out) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: 
v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v0, v1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 -; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16 -; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16 ; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[8:9], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -3174,12 +3323,10 @@ define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) { ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 @@ -3187,24 +3334,34 @@ define void @test_arg_store_v16bf16(<16 x 
bfloat> %in, ptr addrspace(1) %out) { ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v1 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_alignbit_b32 v3, v7, v6, 16 -; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16 -; GCN-NEXT: v_alignbit_b32 v1, v15, v18, 16 -; GCN-NEXT: v_alignbit_b32 v0, v19, v0, 16 -; GCN-NEXT: v_alignbit_b32 v7, v20, v14, 16 -; GCN-NEXT: v_alignbit_b32 v6, v13, v12, 16 -; GCN-NEXT: v_alignbit_b32 v5, v11, v10, 16 -; GCN-NEXT: v_alignbit_b32 v4, v9, v8, 16 +; GCN-NEXT: v_or_b32_e32 v3, v6, v7 +; GCN-NEXT: v_or_b32_e32 v2, v4, v5 +; GCN-NEXT: v_or_b32_e32 v1, v19, v18 +; GCN-NEXT: v_or_b32_e32 v0, v0, v20 +; GCN-NEXT: v_or_b32_e32 v7, v14, v15 +; GCN-NEXT: v_or_b32_e32 v6, v12, v13 +; GCN-NEXT: v_or_b32_e32 v5, v10, v11 +; GCN-NEXT: v_or_b32_e32 v4, v8, v9 ; 
GCN-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:16 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -3214,41 +3371,49 @@ define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 -; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16 -; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v0, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; GFX7-NEXT: v_alignbit_b32 v14, v0, v1, 16 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v14, v1, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v13 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v12 -; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v13, v1, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; 
GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10 -; GFX7-NEXT: v_alignbit_b32 v12, v0, v1, 16 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v12, v1, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v11, v1, v0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: v_alignbit_b32 v11, v0, v1, 16 -; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; GFX7-NEXT: v_or_b32_e32 v6, v6, v7 ; GFX7-NEXT: buffer_store_dwordx4 v[11:14], v[16:17], s[4:7], 0 addr64 offset:16 ; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[16:17], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -4221,9 +4386,10 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_add_i32_e32 v4, vcc, 4, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: buffer_store_short v2, v4, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen @@ -4256,9 +4422,10 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; 
GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v3 @@ -9156,13 +9323,13 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-LABEL: v_fadd_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX8-NEXT: v_add_f32_e32 v2, v3, v2 ; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 @@ -9173,9 +9340,9 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fadd_v2bf16: @@ -9358,9 +9525,9 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX8-NEXT: 
v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fadd_v3bf16: @@ -9598,10 +9765,10 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fadd_v4bf16: @@ -9953,18 +10120,18 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16 -; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; 
GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fadd_v8bf16: @@ -10638,22 +10805,22 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16 -; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16 -; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16 -; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16 -; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16 -; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX8-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 
+; GFX8-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v5, v14, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v7, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fadd_v16bf16: @@ -11816,8 +11983,8 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX8-NEXT: v_add_f32_e32 v13, v13, v29 ; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1 -; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16 +; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX8-NEXT: v_or_b32_sdwa v14, v31, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30 ; GFX8-NEXT: v_add_f32_e32 v33, v33, v34 @@ -12080,36 +12247,36 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; 
GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16 -; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16 -; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16 -; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16 -; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16 -; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16 -; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16 -; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16 -; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16 -; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16 -; GFX8-NEXT: v_alignbit_b32 v13, v13, v32, 16 -; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v30 +; GFX8-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v21, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v5, v22, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v6, v23, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v7, v24, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v8, v25, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v9, v26, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v10, v27, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v11, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v12, v29, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v13, v32, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fadd_v32bf16: @@ -13593,13 +13760,13 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-LABEL: v_fsub_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX8-NEXT: v_sub_f32_e32 v2, v3, v2 ; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 @@ -13610,9 +13777,9 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fsub_v2bf16: @@ -13795,9 +13962,9 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fsub_v3bf16: @@ -14035,10 +14202,10 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fsub_v4bf16: @@ -14355,13 +14522,13 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-LABEL: v_fmul_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2 ; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 @@ -14372,9 +14539,9 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fmul_v2bf16: @@ -14557,9 +14724,9 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fmul_v3bf16: @@ -14797,10 +14964,10 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fmul_v4bf16: @@ -15152,18 +15319,18 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16 -; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v6, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fmul_v8bf16: @@ -15837,22 +16004,22 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16 -; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16 -; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16 -; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16 -; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16 -; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX8-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: 
v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v5, v14, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v7, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fmul_v16bf16: @@ -17015,8 +17182,8 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX8-NEXT: v_mul_f32_e32 v13, v13, v29 ; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1 -; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16 +; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX8-NEXT: v_or_b32_sdwa v14, v31, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30 ; GFX8-NEXT: v_mul_f32_e32 v33, v33, v34 @@ -17279,36 +17446,36 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30 -; GFX8-NEXT: 
v_lshrrev_b32_e32 v13, 16, v13 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16 -; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16 -; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16 -; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16 -; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16 -; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16 -; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16 -; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16 -; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16 -; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16 -; GFX8-NEXT: v_alignbit_b32 v13, v13, v32, 16 -; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v30 +; GFX8-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v21, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v5, v22, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v6, v23, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v7, v24, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v8, v25, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v9, v26, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v10, v27, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v11, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v12, v29, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v13, v32, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fmul_v32bf16: @@ -19102,13 +19269,13 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-LABEL: v_minnum_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX8-NEXT: v_min_f32_e32 v2, v3, v2 ; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 @@ -19119,9 +19286,9 @@ define 
<2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minnum_v2bf16: @@ -19304,9 +19471,9 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minnum_v3bf16: @@ -19544,10 +19711,10 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minnum_v4bf16: @@ -19899,18 +20066,18 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> 
%b) { ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16 -; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minnum_v8bf16: @@ -20584,22 +20751,22 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, 
v0, v9, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16 -; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16 -; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16 -; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16 -; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16 -; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX8-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v5, v14, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v7, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minnum_v16bf16: @@ -21762,8 +21929,8 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX8-NEXT: v_min_f32_e32 v13, v13, v29 ; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1 -; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16 +; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; 
GFX8-NEXT: v_or_b32_sdwa v14, v31, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30 ; GFX8-NEXT: v_min_f32_e32 v33, v33, v34 @@ -22026,36 +22193,36 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16 -; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16 -; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16 -; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16 -; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16 -; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16 -; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16 -; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16 -; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16 -; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16 -; GFX8-NEXT: v_alignbit_b32 v13, v13, v32, 16 -; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; 
GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v30 +; GFX8-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v21, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v5, v22, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v6, v23, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v7, v24, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v8, v25, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v9, v26, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v10, v27, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v11, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v12, v29, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v13, v32, v13 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minnum_v32bf16: @@ -23356,13 +23523,13 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-LABEL: v_maxnum_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX8-NEXT: v_max_f32_e32 v2, v3, v2 ; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 @@ -23373,9 +23540,9 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maxnum_v2bf16: @@ -23558,9 +23725,9 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 
+; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maxnum_v3bf16: @@ -23798,10 +23965,10 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maxnum_v4bf16: @@ -24153,18 +24320,18 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16 -; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maxnum_v8bf16: @@ -24838,22 +25005,22 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16 -; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16 -; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16 -; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16 -; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16 -; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX8-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, 
v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v5, v14, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v7, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maxnum_v16bf16: @@ -26016,8 +26183,8 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX8-NEXT: v_max_f32_e32 v13, v13, v29 ; GFX8-NEXT: v_bfe_u32 v29, v13, 16, 1 -; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX8-NEXT: v_alignbit_b32 v14, v14, v31, 16 +; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX8-NEXT: v_or_b32_sdwa v14, v31, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v30 ; GFX8-NEXT: v_max_f32_e32 v33, v33, v34 @@ -26280,36 +26447,36 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX8-NEXT: 
v_lshrrev_b32_e32 v11, 16, v11 -; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16 -; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16 -; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16 -; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16 -; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16 -; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16 -; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16 -; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16 -; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16 -; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16 -; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16 -; GFX8-NEXT: v_alignbit_b32 v13, v13, v32, 16 -; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v30 +; GFX8-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; 
GFX8-NEXT: v_or_b32_sdwa v4, v21, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v5, v22, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v6, v23, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v7, v24, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v8, v25, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v9, v26, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v10, v27, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v11, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v12, v29, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v13, v32, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maxnum_v32bf16: @@ -32680,22 +32847,22 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX8-LABEL: v_sitofp_v2i16_to_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v1 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc -; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; 
GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX8-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 -; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sitofp_v2i16_to_v2bf16: @@ -32823,6 +32990,7 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 @@ -32831,7 +32999,7 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; GFX8-NEXT: v_bfe_u32 v3, v4, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 ; GFX8-NEXT: v_or_b32_e32 
v2, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc @@ -32841,9 +33009,9 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sitofp_v3i16_to_v3bf16: @@ -33009,7 +33177,6 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_cvt_f32_i32_sdwa v5, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2 @@ -33018,28 +33185,29 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX8-NEXT: v_bfe_u32 v4, v5, 16, 1 ; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 -; GFX8-NEXT: 
v_add_u32_e32 v4, vcc, s4, v4 -; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sitofp_v4i16_to_v4bf16: @@ -33297,22 +33465,22 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX8-LABEL: v_sitofp_v2i32_to_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; 
GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sitofp_v2i32_to_v2bf16: @@ -33428,6 +33596,7 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 @@ -33436,7 +33605,7 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc @@ -33447,8 +33616,8 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; 
GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -33613,7 +33782,7 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 ; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc @@ -33623,10 +33792,10 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sitofp_v4i32_to_v4bf16: @@ -34003,36 +34172,37 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX8-LABEL: v_sitofp_v2i64_to_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v5, v0, v1 -; GFX8-NEXT: v_ffbh_i32_e32 v4, v1 +; GFX8-NEXT: v_xor_b32_e32 v5, v2, v3 +; GFX8-NEXT: v_ffbh_i32_e32 v4, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v5 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v4 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v5 ; GFX8-NEXT: v_min_u32_e32 v4, v4, v5 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] -; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v4 -; 
GFX8-NEXT: v_ldexp_f32 v4, v0, v1 -; GFX8-NEXT: v_bfe_u32 v0, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4 -; GFX8-NEXT: v_xor_b32_e32 v1, v2, v3 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v0 -; GFX8-NEXT: v_ffbh_i32_e32 v0, v3 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, -1, v0 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v1 -; GFX8-NEXT: v_min_u32_e32 v6, v0, v1 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3] -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v4 +; GFX8-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] +; GFX8-NEXT: v_xor_b32_e32 v6, v0, v1 +; GFX8-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX8-NEXT: v_ffbh_i32_e32 v5, v1 +; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v6 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, -1, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 32, v6 +; GFX8-NEXT: v_min_u32_e32 v5, v5, v6 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v4 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] +; GFX8-NEXT: v_ldexp_f32 v2, v2, v3 +; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v5 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 @@ -34040,8 +34210,7 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; 
GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sitofp_v2i64_to_v2bf16: @@ -34382,8 +34551,8 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sitofp_v3i64_to_v3bf16: @@ -34789,16 +34958,16 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v8 ; GFX8-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX8-NEXT: v_xor_b32_e32 v8, v0, v1 ; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc -; GFX8-NEXT: v_xor_b32_e32 v9, v0, v1 -; GFX8-NEXT: v_ffbh_i32_e32 v8, v1 -; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v9 +; GFX8-NEXT: v_ffbh_i32_e32 v7, v1 +; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v8 ; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v4 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, -1, v8 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 32, v9 -; GFX8-NEXT: v_min_u32_e32 v8, v8, v9 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, -1, v7 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v8 +; GFX8-NEXT: v_min_u32_e32 v7, v7, v8 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 32, v10 ; GFX8-NEXT: v_ldexp_f32 v4, v4, v6 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 @@ -34807,10 +34976,10 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX8-NEXT: v_add_u32_e32 v6, 
vcc, v6, v4 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v7 ; GFX8-NEXT: v_ldexp_f32 v6, v0, v1 ; GFX8-NEXT: v_bfe_u32 v0, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v6 @@ -34836,10 +35005,10 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v4, v5, 16 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX8-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_sitofp_v4i64_to_v4bf16: @@ -35288,8 +35457,8 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX8-LABEL: v_uitofp_v2i16_to_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX8-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 @@ -35301,9 
+35470,9 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_v2i16_to_v2bf16: @@ -35452,9 +35621,9 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_v3i16_to_v3bf16: @@ -35623,7 +35792,6 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX8-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_cvt_f32_u32_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 @@ -35632,28 +35800,29 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: v_cvt_f32_u32_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX8-NEXT: 
v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX8-NEXT: v_bfe_u32 v3, v5, 16, 1 ; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX8-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v5 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_v4i16_to_v4bf16: @@ -35916,22 +36085,22 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX8-LABEL: v_uitofp_v2i32_to_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1 -; 
GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_v2i32_to_v2bf16: @@ -36047,6 +36216,7 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 @@ -36055,7 +36225,7 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc @@ -36066,8 +36236,8 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX8-NEXT: 
v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -36232,7 +36402,7 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc @@ -36242,10 +36412,10 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_v4i32_to_v4bf16: @@ -36574,27 +36744,28 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX8-LABEL: v_uitofp_v2i64_to_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_ffbh_u32_e32 v4, v1 +; GFX8-NEXT: v_ffbh_u32_e32 v4, v3 ; GFX8-NEXT: v_min_u32_e32 v4, 32, v4 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] -; GFX8-NEXT: 
v_min_u32_e32 v0, 1, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v4 -; GFX8-NEXT: v_ldexp_f32 v4, v0, v1 -; GFX8-NEXT: v_bfe_u32 v0, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v0 -; GFX8-NEXT: v_ffbh_u32_e32 v0, v3 -; GFX8-NEXT: v_min_u32_e32 v6, 32, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3] -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v4 +; GFX8-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] +; GFX8-NEXT: v_ffbh_u32_e32 v5, v1 +; GFX8-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX8-NEXT: v_min_u32_e32 v5, 32, v5 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v4 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] +; GFX8-NEXT: v_ldexp_f32 v2, v2, v3 +; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v5 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0 @@ -36602,8 +36773,7 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 
s[30:31] ; ; GFX9-LABEL: v_uitofp_v2i64_to_v2bf16: @@ -36870,8 +37040,8 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_v3i64_to_v3bf16: @@ -37188,14 +37358,14 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX8-NEXT: v_ffbh_u32_e32 v4, v7 ; GFX8-NEXT: v_min_u32_e32 v10, 32, v4 ; GFX8-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7] -; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v8 +; GFX8-NEXT: v_ffbh_u32_e32 v7, v1 ; GFX8-NEXT: v_min_u32_e32 v4, 1, v4 ; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX8-NEXT: v_min_u32_e32 v7, 32, v7 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v8 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX8-NEXT: v_ffbh_u32_e32 v8, v1 -; GFX8-NEXT: v_min_u32_e32 v8, 32, v8 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 32, v10 ; GFX8-NEXT: v_ldexp_f32 v4, v4, v6 @@ -37205,10 +37375,10 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v7 ; GFX8-NEXT: v_ldexp_f32 v6, v0, v1 ; GFX8-NEXT: v_bfe_u32 v0, v6, 
16, 1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v6 @@ -37230,10 +37400,10 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v4, v5, 16 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX8-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uitofp_v4i64_to_v4bf16: @@ -38231,49 +38401,53 @@ define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b) ; GCN-LABEL: v_select_v3bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16 +; GCN-NEXT: v_or_b32_e32 v1, v1, v2 +; GCN-NEXT: v_or_b32_e32 v2, v4, v5 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GCN-NEXT: v_cndmask_b32_e32 v0, v6, 
v3, vcc -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_select_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v6 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_select_v3bf16: @@ -38328,14 +38502,18 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v2, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v4, v8, v7, 16 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v1, v1, v2 +; GCN-NEXT: v_or_b32_e32 v2, v5, v6 +; GCN-NEXT: v_or_b32_e32 v3, v3, v4 +; GCN-NEXT: v_or_b32_e32 v4, v7, v8 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc @@ -38349,22 +38527,26 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: 
v_mul_f32_e32 v4, 1.0, v8 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v5, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc @@ -38430,18 +38612,24 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b) ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v2, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v4, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v6, v12, v11, 16 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_or_b32_e32 v1, v1, v2 +; GCN-NEXT: v_or_b32_e32 v2, v7, v8 +; GCN-NEXT: v_or_b32_e32 v3, v3, v4 +; GCN-NEXT: v_or_b32_e32 v4, v9, v10 +; GCN-NEXT: v_or_b32_e32 v5, v5, v6 +; GCN-NEXT: v_or_b32_e32 v6, v11, v12 ; 
GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc @@ -38458,30 +38646,36 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b) ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v10 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v9 -; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v12 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 -; GFX7-NEXT: 
v_lshrrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v11 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; GFX7-NEXT: v_or_b32_e32 v6, v7, v6 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc @@ -38558,22 +38752,30 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; GCN-NEXT: v_alignbit_b32 v2, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; GCN-NEXT: v_alignbit_b32 v4, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GCN-NEXT: v_alignbit_b32 v6, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v8, v16, v15, 16 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_and_b32_e32 v16, 
0xffff0000, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_or_b32_e32 v1, v1, v2 +; GCN-NEXT: v_or_b32_e32 v2, v9, v10 +; GCN-NEXT: v_or_b32_e32 v3, v3, v4 +; GCN-NEXT: v_or_b32_e32 v4, v11, v12 +; GCN-NEXT: v_or_b32_e32 v5, v5, v6 +; GCN-NEXT: v_or_b32_e32 v6, v13, v14 +; GCN-NEXT: v_or_b32_e32 v7, v7, v8 +; GCN-NEXT: v_or_b32_e32 v8, v15, v16 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc ; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc @@ -38593,38 +38795,46 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v10 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v9, v2 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v9, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v11 -; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; GFX7-NEXT: v_mul_f32_e32 v6, 
1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v9, v4 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v14 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v9, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v13 -; GFX7-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v9, v6 +; GFX7-NEXT: v_or_b32_e32 v7, v7, v8 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v16 -; GFX7-NEXT: v_alignbit_b32 v6, v6, v9, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v15 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; GFX7-NEXT: v_or_b32_e32 v8, v9, v8 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc @@ -38692,16 +38902,19 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v1, v1, v2 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v18 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_lshrrev_b32_e32 
v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v2, v2, v17, 16 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_or_b32_e32 v2, v17, v2 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v3, v3, v4 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v20 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v19 @@ -38727,32 +38940,44 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v29 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v4, v4, v17, 16 -; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v4, v17, v4 +; GCN-NEXT: v_or_b32_e32 v5, v5, v6 ; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v18, v18, v19, 16 -; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; GCN-NEXT: v_alignbit_b32 v8, v20, v21, 16 -; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; GCN-NEXT: v_alignbit_b32 v10, 
v22, v23, 16 -; GCN-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; GCN-NEXT: v_alignbit_b32 v12, v24, v25, 16 -; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GCN-NEXT: v_alignbit_b32 v14, v26, v27, 16 -; GCN-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_or_b32_e32 v18, v19, v18 +; GCN-NEXT: v_or_b32_e32 v7, v7, v8 +; GCN-NEXT: v_or_b32_e32 v8, v21, v20 +; GCN-NEXT: v_or_b32_e32 v9, v9, v10 +; GCN-NEXT: v_or_b32_e32 v10, v23, v22 +; GCN-NEXT: v_or_b32_e32 v11, v11, v12 +; GCN-NEXT: v_or_b32_e32 v12, v25, v24 +; GCN-NEXT: v_or_b32_e32 v13, v13, v14 +; GCN-NEXT: v_or_b32_e32 v14, v27, v26 +; GCN-NEXT: v_or_b32_e32 v15, v15, v16 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc ; GCN-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc @@ -38779,8 +39004,9 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: 
v_alignbit_b32 v14, v14, v16, 16 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v14, v16, v14 ; GCN-NEXT: v_cndmask_b32_e32 v15, v14, v15, vcc ; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 @@ -38790,68 +39016,83 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v18 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v17, v2 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v20 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v17, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v19 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v17, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v17, v4 ; GFX7-NEXT: buffer_load_dword v17, 
off, s[0:3], s32 offset:4 -; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v22 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v21 -; GFX7-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v18, v19, v18 +; GFX7-NEXT: v_or_b32_e32 v7, v7, v8 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v24 -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_alignbit_b32 v18, v18, v19, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v23 -; GFX7-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_or_b32_e32 v8, v19, v8 +; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v26 -; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_alignbit_b32 v8, v8, v19, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v25 -; GFX7-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX7-NEXT: v_or_b32_e32 v10, v19, v10 +; GFX7-NEXT: v_or_b32_e32 v11, v11, v12 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v28 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v27 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_alignbit_b32 v10, v10, v19, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v27 -; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: v_or_b32_e32 v12, v19, v12 +; GFX7-NEXT: v_or_b32_e32 v13, v13, v14 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v30 -; GFX7-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; GFX7-NEXT: v_alignbit_b32 v12, v12, v19, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v29 +; GFX7-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v19 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_alignbit_b32 v14, v14, v19, 16 +; GFX7-NEXT: v_or_b32_e32 v14, v19, v14 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc @@ -38874,10 +39115,11 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, 
v17 -; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_alignbit_b32 v6, v16, v6, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v6, v16 ; GFX7-NEXT: v_cndmask_b32_e32 v15, v6, v15, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 @@ -38953,200 +39195,232 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v2 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v4 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v1, v2, v1 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v6 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v2, v3, v2 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v8 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: 
v_or_b32_e32 v4, v5, v4 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v12 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_or_b32_e32 v5, v6, v5 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v14 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v6, v7, v6 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v16 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_or_b32_e32 v7, v8, v7 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v18 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_or_b32_e32 v8, v9, v8 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v20 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_or_b32_e32 v9, v10, v9 ; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:12 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v22 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_or_b32_e32 v10, v11, v10 ; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v24 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v23 -; GCN-NEXT: 
v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_or_b32_e32 v11, v12, v11 ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:20 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v26 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_or_b32_e32 v12, v13, v12 ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v28 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_or_b32_e32 v13, v14, v13 ; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:28 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v30 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_alignbit_b32 v14, v14, v20, 16 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_or_b32_e32 v14, v20, v14 ; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:24 ; GCN-NEXT: s_waitcnt vmcnt(5) ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: s_waitcnt vmcnt(4) ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v15, v16, v15 ; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36 ; GCN-NEXT: s_waitcnt vmcnt(4) ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v17 ; GCN-NEXT: s_waitcnt vmcnt(3) ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: 
v_alignbit_b32 v16, v16, v17, 16 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_or_b32_e32 v16, v17, v16 ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:32 ; GCN-NEXT: s_waitcnt vmcnt(3) ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v19 ; GCN-NEXT: s_waitcnt vmcnt(2) ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_alignbit_b32 v17, v17, v19, 16 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_or_b32_e32 v17, v19, v17 ; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44 ; GCN-NEXT: s_waitcnt vmcnt(2) ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:40 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_alignbit_b32 v18, v20, v18, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_or_b32_e32 v18, v18, v20 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:52 ; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:48 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_alignbit_b32 v19, v19, v20, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_or_b32_e32 v19, v20, v19 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 ; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:56 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_alignbit_b32 v20, v20, v21, 16 +; GCN-NEXT: 
v_lshrrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_or_b32_e32 v20, v21, v20 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v23 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 ; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_alignbit_b32 v21, v21, v22, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_or_b32_e32 v21, v22, v21 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v23 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_alignbit_b32 v22, v22, v23, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_or_b32_e32 v22, v23, v22 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v25 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_alignbit_b32 v23, v23, v24, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_or_b32_e32 v23, v24, v23 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v25 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v26 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_alignbit_b32 v24, v24, v25, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GCN-NEXT: 
v_or_b32_e32 v24, v25, v24 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v26 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100 ; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_alignbit_b32 v25, v25, v26, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GCN-NEXT: v_or_b32_e32 v25, v26, v25 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v28 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 ; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:104 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_alignbit_b32 v26, v26, v27, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_or_b32_e32 v26, v27, v26 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v28 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v29 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:116 ; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:112 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_alignbit_b32 v27, v27, v28, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_or_b32_e32 v27, v28, v27 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v29 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v30 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 ; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_alignbit_b32 v28, v28, v29, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GCN-NEXT: v_or_b32_e32 v28, v29, v28 ; GCN-NEXT: 
s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v30 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v31 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_alignbit_b32 v29, v29, v30, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GCN-NEXT: v_or_b32_e32 v29, v30, v29 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v31 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v32 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_alignbit_b32 v30, v30, v31, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GCN-NEXT: v_or_b32_e32 v30, v31, v30 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v32 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_alignbit_b32 v31, v31, v32, 16 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_or_b32_e32 v31, v32, v31 ; GCN-NEXT: v_cndmask_b32_e32 v31, v31, v30, vcc ; GCN-NEXT: v_cndmask_b32_e32 v29, v29, v14, vcc ; GCN-NEXT: v_cndmask_b32_e32 v28, v28, v13, vcc @@ -39201,242 +39475,273 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v4 -; GFX7-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v6 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v8 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v8 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v10 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v6, 16 -; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:12 -; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 -; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 -; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:8 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 +; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16 +; GFX7-NEXT: 
buffer_load_dword v8, off, s[0:3], s32 offset:24 +; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 +; GFX7-NEXT: v_or_b32_e32 v19, v19, v20 +; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 +; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; GFX7-NEXT: v_alignbit_b32 v27, v28, v27, 16 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX7-NEXT: v_or_b32_e32 v13, v13, v14 +; GFX7-NEXT: v_or_b32_e32 v27, v27, v28 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; GFX7-NEXT: v_alignbit_b32 v23, v24, v23, 16 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GFX7-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX7-NEXT: v_or_b32_e32 v23, v23, v24 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX7-NEXT: v_alignbit_b32 v19, v20, v19, 16 +; 
GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX7-NEXT: v_or_b32_e32 v17, v17, v18 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: v_alignbit_b32 v21, v22, v21, 16 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX7-NEXT: v_or_b32_e32 v21, v21, v22 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_alignbit_b32 v25, v26, v25, 16 +; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GFX7-NEXT: v_or_b32_e32 v25, v25, v26 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX7-NEXT: v_alignbit_b32 v29, v30, v29, 16 +; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX7-NEXT: v_or_b32_e32 v29, v29, v30 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 ; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 -; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 +; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 
GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; GFX7-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 ; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; GFX7-NEXT: s_waitcnt vmcnt(14) -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: s_waitcnt vmcnt(13) -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: s_waitcnt vmcnt(12) -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:20 -; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:20 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(12) -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 ; GFX7-NEXT: s_waitcnt vmcnt(11) ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; GFX7-NEXT: s_waitcnt vmcnt(9) ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; 
GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX7-NEXT: s_waitcnt vmcnt(7) ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX7-NEXT: s_waitcnt vmcnt(5) ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX7-NEXT: s_waitcnt vmcnt(4) ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX7-NEXT: s_waitcnt vmcnt(3) ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:28 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v8, v7 +; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 +; 
GFX7-NEXT: v_cndmask_b32_e32 v7, v7, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v3, v6, v2, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX7-NEXT: v_alignbit_b32 v9, v9, v10, 16 -; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 -; GFX7-NEXT: v_cndmask_b32_e32 v9, v9, v4, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: v_or_b32_e32 v8, v31, v8 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 +; GFX7-NEXT: v_cndmask_b32_e32 v8, v8, v4, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX7-NEXT: v_alignbit_b32 v10, v10, v31, 16 +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX7-NEXT: v_or_b32_e32 v10, v31, v10 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 -; GFX7-NEXT: v_cndmask_b32_e32 v10, v10, v5, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v5, v8, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v10 +; GFX7-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc +; GFX7-NEXT: 
v_lshlrev_b32_e32 v8, 16, v9 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_alignbit_b32 v12, v12, v31, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX7-NEXT: v_or_b32_e32 v12, v31, v12 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 ; GFX7-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_alignbit_b32 v14, v14, v31, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX7-NEXT: v_or_b32_e32 v14, v31, v14 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; GFX7-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_alignbit_b32 v16, v16, v31, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX7-NEXT: v_or_b32_e32 v16, v31, v16 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 ; GFX7-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_alignbit_b32 v18, v18, v31, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX7-NEXT: v_or_b32_e32 v18, v31, v18 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; GFX7-NEXT: v_cndmask_b32_e32 v17, v18, v17, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v17 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_alignbit_b32 v20, v20, v31, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX7-NEXT: v_or_b32_e32 v20, v31, v20 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 ; 
GFX7-NEXT: v_cndmask_b32_e32 v19, v20, v19, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v19 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_alignbit_b32 v22, v22, v31, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX7-NEXT: v_or_b32_e32 v22, v31, v22 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 ; GFX7-NEXT: v_cndmask_b32_e32 v21, v22, v21, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v21 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_alignbit_b32 v24, v24, v31, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX7-NEXT: v_or_b32_e32 v24, v31, v24 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 ; GFX7-NEXT: v_cndmask_b32_e32 v23, v24, v23, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v23 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_alignbit_b32 v26, v26, v31, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX7-NEXT: v_or_b32_e32 v26, v31, v26 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 ; GFX7-NEXT: v_cndmask_b32_e32 v25, v26, v25, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v25 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_alignbit_b32 v28, v28, v31, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX7-NEXT: v_or_b32_e32 v28, v31, v28 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; GFX7-NEXT: v_cndmask_b32_e32 v27, v28, v27, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v27 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_alignbit_b32 v30, v30, v31, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX7-NEXT: v_or_b32_e32 v30, v31, v30 ; 
GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; GFX7-NEXT: v_cndmask_b32_e32 v29, v30, v29, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v29 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX7-NEXT: v_alignbit_b32 v31, v31, v32, 16 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: v_or_b32_e32 v31, v32, v31 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX7-NEXT: v_alignbit_b32 v32, v32, v33, 16 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_or_b32_e32 v32, v33, v32 ; GFX7-NEXT: v_cndmask_b32_e32 v31, v32, v31, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v31 ; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 @@ -39556,12 +39861,14 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> ; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s3 ; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s2 ; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s5 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: v_alignbit_b32 v2, v3, v4, 16 +; GCN-NEXT: v_or_b32_e32 v1, v2, v1 +; GCN-NEXT: v_or_b32_e32 v2, v4, v3 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc @@ -39572,13 +39879,15 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> ; GFX7-LABEL: s_select_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1 -; GFX7-NEXT: v_lshrrev_b32_e32 
v1, 16, v1 ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s0 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s4 ; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s3 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s2 ; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s5 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -39667,14 +39976,18 @@ define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s2 ; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s7 ; GCN-NEXT: v_mul_f32_e64 v8, 1.0, s6 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GCN-NEXT: v_alignbit_b32 v2, v3, v4, 16 -; GCN-NEXT: v_alignbit_b32 v3, v5, v6, 16 -; GCN-NEXT: v_alignbit_b32 v4, v7, v8, 16 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_or_b32_e32 v1, v2, v1 +; GCN-NEXT: v_or_b32_e32 v2, v4, v3 +; GCN-NEXT: v_or_b32_e32 v3, v6, v5 +; GCN-NEXT: v_or_b32_e32 v4, v8, v7 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc @@ -39685,21 +39998,25 @@ define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; GFX7-LABEL: s_select_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: 
v_mul_f32_e64 v1, 1.0, s1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s0 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s5 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s5 ; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s4 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16 -; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s3 ; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s2 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s7 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s7 ; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s6 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc @@ -42888,15 +43205,15 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> ; GFX8-LABEL: v_fma_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX8-NEXT: v_fma_f32 v3, v5, v4, v3 ; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 
v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; GFX8-NEXT: v_fma_f32 v0, v0, v1, v2 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3 @@ -42907,9 +43224,9 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fma_v2bf16: @@ -43113,9 +43430,9 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fma_v3bf16: @@ -43388,10 +43705,10 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fma_v4bf16: @@ -43798,8 +44115,8 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl ; GFX8-LABEL: v_fmuladd_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3 @@ -43808,13 +44125,13 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3 @@ -43827,16 +44144,16 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; GFX8-NEXT: 
v_add_u32_e32 v1, vcc, 0x7fff, v1 ; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fmuladd_v2bf16: @@ -44140,9 +44457,9 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fmuladd_v3bf16: @@ -44554,10 +44871,10 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fmuladd_v4bf16: diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll index 4787f21e28598..ac4eb9ca77a86 100644 --- 
a/llvm/test/CodeGen/AMDGPU/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/bswap.ll @@ -604,15 +604,17 @@ define <2 x i16> @v_bswap_v2i16(<2 x i16> %src) { ; SI-LABEL: v_bswap_v2i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v0, v0, 8 -; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 -; SI-NEXT: s_mov_b32 s4, 0xff00ff -; SI-NEXT: v_alignbit_b32 v3, v1, v1, 8 +; SI-NEXT: v_alignbit_b32 v2, v1, v1, 8 ; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v3 +; SI-NEXT: s_mov_b32 s4, 0xff00ff +; SI-NEXT: v_alignbit_b32 v3, v0, v0, 8 +; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v2 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_bswap_v2i16: @@ -635,20 +637,21 @@ define <3 x i16> @v_bswap_v3i16(<3 x i16> %src) { ; SI-LABEL: v_bswap_v3i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v0, v0, 8 -; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 -; SI-NEXT: s_mov_b32 s4, 0xff00ff -; SI-NEXT: v_alignbit_b32 v4, v1, v1, 8 +; SI-NEXT: v_alignbit_b32 v3, v1, v1, 8 ; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24 +; SI-NEXT: s_mov_b32 s4, 0xff00ff +; SI-NEXT: v_alignbit_b32 v4, v0, v0, 8 +; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 ; SI-NEXT: v_alignbit_b32 v5, v2, v2, 8 ; SI-NEXT: v_alignbit_b32 v2, v2, v2, 24 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v3 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v4 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v3 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v4 ; SI-NEXT: v_bfi_b32 v2, s4, v2, v5 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_lshrrev_b32_e32 
v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_bswap_v3i16: @@ -673,25 +676,27 @@ define <4 x i16> @v_bswap_v4i16(<4 x i16> %src) { ; SI-LABEL: v_bswap_v4i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_alignbit_b32 v4, v2, v2, 8 -; SI-NEXT: v_alignbit_b32 v2, v2, v2, 24 +; SI-NEXT: v_alignbit_b32 v4, v1, v1, 8 +; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24 ; SI-NEXT: s_mov_b32 s4, 0xff00ff -; SI-NEXT: v_alignbit_b32 v5, v3, v3, 8 -; SI-NEXT: v_alignbit_b32 v3, v3, v3, 24 -; SI-NEXT: v_alignbit_b32 v6, v0, v0, 8 +; SI-NEXT: v_alignbit_b32 v5, v0, v0, 8 ; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 -; SI-NEXT: v_alignbit_b32 v7, v1, v1, 8 -; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v4 -; SI-NEXT: v_bfi_b32 v3, s4, v3, v5 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v6 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v7 +; SI-NEXT: v_alignbit_b32 v6, v3, v3, 8 +; SI-NEXT: v_alignbit_b32 v3, v3, v3, 24 +; SI-NEXT: v_alignbit_b32 v7, v2, v2, 8 +; SI-NEXT: v_alignbit_b32 v2, v2, v2, 24 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v4 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v5 +; SI-NEXT: v_bfi_b32 v3, s4, v3, v6 +; SI-NEXT: v_bfi_b32 v2, s4, v2, v7 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v5 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_bswap_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll 
b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index f4b432dce8c8a..7f03db19c1668 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -9347,31 +9347,31 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v6 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v5, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v0, v5 ; GFX8-NEXT: v_mov_b32_e32 v1, v6 ; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc @@ -9402,16 +9402,17 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX7-NEXT: v_add_f32_e32 v6, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v1 ; GFX7-NEXT: v_mov_b32_e32 v5, v0 ; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc @@ -9444,17 +9445,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX6-NEXT: 
v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX6-NEXT: v_add_f32_e32 v6, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v1 ; GFX6-NEXT: v_mov_b32_e32 v5, v0 ; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc @@ -9773,30 +9775,30 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 ; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v6, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v0 ; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc @@ -9828,16 +9830,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX7-NEXT: v_add_f32_e32 v6, v3, v1 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc @@ -9870,17 +9873,18 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: 
v_add_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX6-NEXT: v_add_f32_e32 v6, v3, v1 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc @@ -10437,13 +10441,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX8-NEXT: .LBB28_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB28_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX8-NEXT: v_add_f32_e32 v4, v4, v8 ; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 @@ -10451,16 +10455,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v9 ; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 ; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, 
v4 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 @@ -10524,15 +10528,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB28_4 Depth 2 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v10 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v9 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; GFX7-NEXT: v_add_f32_e32 v4, v7, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: s_mov_b64 s[12:13], exec ; GFX7-NEXT: v_mov_b32_e32 v6, v4 @@ -10598,15 +10603,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB28_4 Depth 2 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v7 -; GFX6-NEXT: v_mul_f32_e32 v7, 1.0, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_add_f32_e32 v4, v4, v10 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v9 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: 
v_alignbit_b32 v4, v4, v6, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; GFX6-NEXT: v_add_f32_e32 v4, v7, v10 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: s_mov_b64 s[12:13], exec ; GFX6-NEXT: v_mov_b32_e32 v6, v4 @@ -10954,31 +10960,31 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v6 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, 
v1 -; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v5, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v0, v5 ; GFX8-NEXT: v_mov_b32_e32 v1, v6 ; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc @@ -11009,16 +11015,17 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX7-NEXT: v_add_f32_e32 v6, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v1 ; GFX7-NEXT: v_mov_b32_e32 v5, v0 ; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc @@ -11051,17 +11058,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; 
GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX6-NEXT: v_add_f32_e32 v6, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v1 ; GFX6-NEXT: v_mov_b32_e32 v5, v0 ; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc @@ -11380,30 +11388,30 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 ; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; 
GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v6, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v0 ; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc @@ -11435,16 +11443,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX7-NEXT: v_add_f32_e32 v6, v3, v1 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc @@ -11477,17 +11486,18 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, 
v4 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX6-NEXT: v_add_f32_e32 v6, v3, v1 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc @@ -11817,31 +11827,31 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v6 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; GFX8-NEXT: 
v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v5, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v0, v5 ; GFX8-NEXT: v_mov_b32_e32 v1, v6 ; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc @@ -11872,16 +11882,17 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX7-NEXT: v_add_f32_e32 v6, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v1 ; GFX7-NEXT: v_mov_b32_e32 v5, v0 ; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc @@ -11914,17 +11925,18 @@ define <2 x bfloat> 
@buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX6-NEXT: v_add_f32_e32 v6, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v1 ; GFX6-NEXT: v_mov_b32_e32 v5, v0 ; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc @@ -12243,30 +12255,30 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 ; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 ; 
GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v6, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v0 ; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc @@ -12298,16 +12310,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX7-NEXT: v_add_f32_e32 v6, v3, v1 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 ; 
GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc @@ -12340,17 +12353,18 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re ; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX6-NEXT: v_add_f32_e32 v6, v3, v1 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc @@ -12669,30 +12683,30 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 
v0, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 ; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v6, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v0 ; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc @@ -12724,16 +12738,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX7-NEXT: v_add_f32_e32 v6, v3, 
v1 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc @@ -12766,17 +12781,18 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v1 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_add_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX6-NEXT: v_add_f32_e32 v6, v3, v1 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index 6f1675edbe58a..c5d8267f9ec35 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -7797,31 +7797,31 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; 
GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v6 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v5, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v0, v5 ; GFX8-NEXT: v_mov_b32_e32 v1, v6 ; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc @@ -7841,27 +7841,28 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: 
v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX7-NEXT: v_max_f32_e32 v6, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v1 ; GFX7-NEXT: v_mov_b32_e32 v5, v0 ; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc @@ -7883,28 +7884,29 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu ; GFX6-NEXT: v_mov_b32_e32 v2, s20 ; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 
v3, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX6-NEXT: v_max_f32_e32 v6, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v1 ; GFX6-NEXT: v_mov_b32_e32 v5, v0 ; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc @@ -8313,30 +8315,30 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 ; 
GFX8-NEXT: v_max_f32_e32 v5, v5, v3 ; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v6, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v0 ; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc @@ -8357,36 +8359,37 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v3, 16 +; GFX7-NEXT: v_max_f32_e32 v6, v3, v1 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8399,14 +8402,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-NEXT: v_mov_b32_e32 v2, s20 ; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 
v4, 16, v2 ; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8414,22 +8417,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_max_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v3, 16 +; GFX6-NEXT: v_max_f32_e32 v6, v3, v1 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v6, 16 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9127,13 +9131,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX8-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: 
v_lshlrev_b32_e32 v4, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX8-NEXT: v_max_f32_e32 v4, v4, v8 ; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 @@ -9141,16 +9145,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_max_f32_e32 v5, v5, v9 ; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 ; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 @@ -9202,27 +9206,28 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX7-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX7-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v7 -; GFX7-NEXT: v_max_f32_e32 v4, v4, v9 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v9 +; GFX7-NEXT: v_max_f32_e32 v4, v7, v10 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v10 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v6, 16 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: s_mov_b64 s[12:13], exec ; GFX7-NEXT: v_mov_b32_e32 v6, v4 @@ -9276,27 +9281,28 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX6-NEXT: s_cbranch_execnz .LBB21_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX6-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v6 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 ; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX6-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v7 -; GFX6-NEXT: v_max_f32_e32 v4, v4, v9 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v7 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; 
GFX6-NEXT: v_max_f32_e32 v6, v6, v9 +; GFX6-NEXT: v_max_f32_e32 v4, v7, v10 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_max_f32_e32 v7, v7, v10 -; GFX6-NEXT: v_alignbit_b32 v5, v5, v6, 16 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: s_mov_b64 s[12:13], exec ; GFX6-NEXT: v_mov_b32_e32 v6, v4 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index acb27be1846b9..7dc5de97179f4 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -7797,31 +7797,31 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v6 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, 
v5 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v5, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v0, v5 ; GFX8-NEXT: v_mov_b32_e32 v1, v6 ; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc @@ -7841,27 +7841,28 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: 
v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v3 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX7-NEXT: v_min_f32_e32 v6, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v1 ; GFX7-NEXT: v_mov_b32_e32 v5, v0 ; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc @@ -7883,28 +7884,29 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX6-NEXT: v_mov_b32_e32 v2, s20 ; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX6-NEXT: v_mov_b32_e32 v4, s6 ; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX6-NEXT: v_min_f32_e32 v6, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v0, v1 ; 
GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v1 ; GFX6-NEXT: v_mov_b32_e32 v5, v0 ; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc @@ -8313,30 +8315,30 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_min_f32_e32 v5, v5, v3 ; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v6, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v0 ; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc @@ -8357,36 +8359,37 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: s_add_i32 s6, s20, 0x400 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v0 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v3, 16 +; GFX7-NEXT: v_min_f32_e32 v6, v3, v1 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 ; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX7-NEXT: 
v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB20_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8399,14 +8402,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-NEXT: v_mov_b32_e32 v2, s20 ; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX6-NEXT: s_add_i32 s6, s20, 0x400 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8414,22 +8417,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_min_f32_e32 v5, v5, v0 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v3, 16 +; GFX6-NEXT: v_min_f32_e32 v6, v3, v1 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v6, 16 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX6-NEXT: 
v_mov_b32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, v3 ; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB20_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9127,13 +9131,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v5 ; GFX8-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX8-NEXT: v_min_f32_e32 v4, v4, v8 ; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 @@ -9141,16 +9145,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_min_f32_e32 v5, v5, v9 ; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 ; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc -; 
GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v4, v5 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v5, v6 @@ -9202,27 +9206,28 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX7-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX7-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v7 -; GFX7-NEXT: v_min_f32_e32 v4, v4, v9 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v9 +; GFX7-NEXT: v_min_f32_e32 v4, v7, v10 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v10 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v6, 16 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: s_mov_b64 s[12:13], exec ; GFX7-NEXT: v_mov_b32_e32 v6, v4 @@ -9276,27 
+9281,28 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX6-NEXT: s_cbranch_execnz .LBB21_1 ; GFX6-NEXT: ; %bb.2: ; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX6-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v6 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX6-NEXT: s_mov_b64 s[6:7], 0 -; GFX6-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 ; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX6-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Loop Header: Depth=1 ; GFX6-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v4 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v7 -; GFX6-NEXT: v_min_f32_e32 v4, v4, v9 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v7 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_min_f32_e32 v6, v6, v9 +; GFX6-NEXT: v_min_f32_e32 v4, v7, v10 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v10 -; GFX6-NEXT: v_alignbit_b32 v5, v5, v6, 16 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: s_mov_b64 s[12:13], exec ; GFX6-NEXT: v_mov_b32_e32 v6, v4 diff --git a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll index 5c7172ff8d047..79d5251361f58 100644 --- a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll +++ b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll @@ 
-162,8 +162,8 @@ define void @undef_lo2_v4i16(<2 x i16> %arg0) { ; GFX8-LABEL: undef_lo2_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use v[0:1] ; GFX8-NEXT: ;;#ASMEND @@ -187,8 +187,8 @@ define void @undef_lo2_v4f16(<2 x half> %arg0) { ; GFX8-LABEL: undef_lo2_v4f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use v[0:1] ; GFX8-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll index 7208eaeff8eb1..3976151db89e6 100644 --- a/llvm/test/CodeGen/AMDGPU/build_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll @@ -189,7 +189,9 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_alignbit_b32 v0, 5, s6, 16 +; GFX6-NEXT: s_lshr_b32 s4, s6, 16 +; GFX6-NEXT: s_or_b32 s4, s4, 0x50000 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 86e890b06989a..4c4b7a2d9a969 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -1461,30 +1461,30 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; SI-NEXT: s_mov_b32 s15, s11 ; 
SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[12:15], 0 addr64 offset:3 -; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[12:15], 0 addr64 offset:2 +; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[12:15], 0 addr64 offset:3 +; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[12:15], 0 addr64 offset:2 ; SI-NEXT: s_mov_b64 s[12:13], s[6:7] -; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[12:15], 0 addr64 offset:2 +; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[12:15], 0 addr64 offset:2 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s8, s2 ; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: s_mov_b32 s2, s10 ; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v4 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v4 -; SI-NEXT: v_or_b32_e32 v5, v5, v4 -; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 +; SI-NEXT: v_or_b32_e32 v6, v3, v2 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v2 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v6, v3, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2 -; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v3 -; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_alignbit_b32 v4, v4, v5, 24 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v5 ; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_mov_b32_e32 v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 ; SI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll index 85e56a243cdc9..acab8c6b44e77 100644 --- 
a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll @@ -480,13 +480,14 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32 ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s2, s2, 16 +; GCN-NEXT: s_and_b32 s3, s3, 0xffff0000 +; GCN-NEXT: s_or_b32 s2, s2, s3 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_lshr_b32 s0, s3, 16 ; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_alignbit_b32 v0, s0, v0, 16 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; @@ -534,8 +535,9 @@ define i32 @divergent_vec_i16_HH(i32 %a, i32 %b) { ; GCN-LABEL: divergent_vec_i16_HH: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: divergent_vec_i16_HH: diff --git a/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll index d8f81db70e309..5d1023fc9249d 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll @@ -220,16 +220,18 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s4, s3, 0xffff0000 ; CI-NEXT: s_lshl_b32 s3, s3, 16 -; CI-NEXT: s_and_b32 s5, s2, 0xffff0000 ; CI-NEXT: v_mul_f32_e64 v0, 1.0, |s4| ; CI-NEXT: v_mul_f32_e64 v1, 1.0, |s3| -; CI-NEXT: v_mul_f32_e64 v2, 1.0, |s5| -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: s_and_b32 s5, s2, 0xffff0000 +; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; CI-NEXT: 
s_lshl_b32 s2, s2, 16 -; CI-NEXT: v_alignbit_b32 v1, v0, v1, 16 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; CI-NEXT: v_or_b32_e32 v1, v1, v0 +; CI-NEXT: v_mul_f32_e64 v0, 1.0, |s5| ; CI-NEXT: v_mul_f32_e64 v2, 1.0, |s2| -; CI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_or_b32_e32 v0, v2, v0 ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -537,16 +539,17 @@ define amdgpu_kernel void @v_fabs_fold_self_v2bf16(ptr addrspace(1) %out, ptr ad ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; CI-NEXT: v_mul_f32_e64 v4, 1.0, |v3| ; CI-NEXT: v_mul_f32_e64 v5, 1.0, |v2| -; CI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; CI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; CI-NEXT: v_mul_f32_e32 v3, v4, v3 +; CI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; CI-NEXT: v_mul_f32_e32 v2, v5, v2 +; CI-NEXT: v_mul_f32_e32 v3, v4, v3 +; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; CI-NEXT: v_or_b32_e32 v2, v3, v2 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -585,8 +588,8 @@ define amdgpu_kernel void @v_fabs_fold_self_v2bf16(ptr addrspace(1) %out, ptr ad ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -716,18 +719,19 @@ define amdgpu_kernel void 
@v_fabs_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dword v2, v[0:1] -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: s_and_b32 s1, s4, 0xffff0000 ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: s_lshl_b32 s0, s4, 16 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_and_b32 s0, s4, 0xffff0000 +; CI-NEXT: s_lshl_b32 s1, s4, 16 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_and_b32_e32 v3, 0x7fff0000, v2 -; CI-NEXT: v_and_b32_e32 v2, 0x7fff, v2 -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_mul_f32_e32 v3, s1, v3 +; CI-NEXT: v_and_b32_e32 v3, 0x7fff, v2 +; CI-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_mul_f32_e32 v2, s0, v2 +; CI-NEXT: v_mul_f32_e32 v3, s1, v3 +; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; CI-NEXT: v_or_b32_e32 v2, v3, v2 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -767,8 +771,8 @@ define amdgpu_kernel void @v_fabs_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; VI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 450d66767600b..0288524db268b 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -3975,20 +3975,20 @@ define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16(<2 x float> %ma ; GFX8-LABEL: v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16: ; GFX8: ; %bb.0: ; 
GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 -; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4116,42 +4116,42 @@ define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2f64_sign_v2bf16(<2 x double> %m ; GFX8-LABEL: v_copysign_out_v2bf16_mag_v2f64_sign_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_f64_e32 v7, v[0:1] -; GFX8-NEXT: v_cvt_f32_f64_e32 v8, v[2:3] +; GFX8-NEXT: v_cvt_f32_f64_e32 v7, v[2:3] +; GFX8-NEXT: v_cvt_f32_f64_e32 v8, v[0:1] ; GFX8-NEXT: v_cvt_f64_f32_e32 v[5:6], v7 ; GFX8-NEXT: v_and_b32_e32 v9, 1, v7 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v9 -; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[5:6]| -; GFX8-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[5:6] +; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[2:3]|, |v[5:6]| +; GFX8-NEXT: v_cmp_nlg_f64_e32 vcc, v[2:3], v[5:6] ; GFX8-NEXT: v_cndmask_b32_e64 v5, -1, 1, s[6:7] ; GFX8-NEXT: 
v_add_u32_e64 v5, s[6:7], v7, v5 ; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v7, v5, v7, vcc ; GFX8-NEXT: v_bfe_u32 v5, v7, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v5, v7 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[5:6], v8 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v7 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 -; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], v[0:1], v[0:1] -; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[2:3]|, |v[5:6]| -; GFX8-NEXT: v_cmp_nlg_f64_e32 vcc, v[2:3], v[5:6] +; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v5 +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3] +; GFX8-NEXT: v_cvt_f64_f32_e32 v[5:6], v8 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v7 -; GFX8-NEXT: v_and_b32_e32 v1, 1, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v9, v7, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1 -; GFX8-NEXT: v_cndmask_b32_e64 v1, -1, 1, s[6:7] -; GFX8-NEXT: v_add_u32_e64 v1, s[6:7], v8, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 1, v8 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v3 +; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[5:6]| +; GFX8-NEXT: v_cndmask_b32_e32 v2, v9, v7, vcc +; GFX8-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[5:6] +; GFX8-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v3, -1, 1, s[6:7] +; GFX8-NEXT: v_add_u32_e64 v3, s[6:7], v8, v3 ; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc -; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3] -; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 ; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; 
GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -4371,18 +4371,18 @@ define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2bf16_sign_v2f32(<2 x bfloat> %m ; GFX8-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4850,26 +4850,27 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2f32_sign_v2bf16(<2 x float> in ; ; GFX8-LABEL: s_copysign_out_v2bf16_mag_v2f32_sign_v2bf16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x10010 -; GFX8-NEXT: s_add_i32 s4, s4, s0 -; GFX8-NEXT: s_or_b32 s3, s0, 0x400000 +; GFX8-NEXT: s_bfe_u32 s4, s1, 0x10010 +; GFX8-NEXT: s_add_i32 s4, s4, s1 +; GFX8-NEXT: s_or_b32 s3, s1, 0x400000 ; GFX8-NEXT: s_add_i32 s6, s4, 0x7fff -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], 
s0, s0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], s1, s1 ; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX8-NEXT: s_cselect_b32 s3, s3, s6 -; GFX8-NEXT: s_bfe_u32 s0, s1, 0x10010 -; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_or_b32 s4, s1, 0x400000 -; GFX8-NEXT: s_add_i32 s5, s0, 0x7fff -; GFX8-NEXT: v_cmp_u_f32_e64 s[0:1], s1, s1 +; GFX8-NEXT: s_cselect_b32 s1, s3, s6 +; GFX8-NEXT: s_and_b32 s3, s1, 0x7fff0000 +; GFX8-NEXT: s_bfe_u32 s1, s0, 0x10010 +; GFX8-NEXT: s_add_i32 s1, s1, s0 +; GFX8-NEXT: s_or_b32 s4, s0, 0x400000 +; GFX8-NEXT: s_add_i32 s5, s1, 0x7fff +; GFX8-NEXT: v_cmp_u_f32_e64 s[0:1], s0, s0 ; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX8-NEXT: s_cselect_b32 s0, s4, s5 ; GFX8-NEXT: s_lshr_b32 s0, s0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 16 -; GFX8-NEXT: s_mov_b32 s0, 0x7fff7fff +; GFX8-NEXT: s_or_b32 s0, s0, s3 +; GFX8-NEXT: s_mov_b32 s1, 0x7fff7fff +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v1 +; GFX8-NEXT: v_bfi_b32 v0, s1, v0, v1 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -4995,52 +4996,53 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2f64_sign_v2bf16(<2 x double> i ; ; GFX8-LABEL: s_copysign_out_v2bf16_mag_v2f64_sign_v2bf16: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_cvt_f32_f64_e32 v2, s[0:1] -; GFX8-NEXT: v_cvt_f32_f64_e32 v3, s[2:3] -; GFX8-NEXT: v_cmp_u_f64_e64 s[6:7], s[0:1], s[0:1] +; GFX8-NEXT: v_cvt_f32_f64_e32 v2, s[2:3] +; GFX8-NEXT: v_cvt_f32_f64_e32 v3, s[0:1] +; GFX8-NEXT: v_cmp_u_f64_e64 s[8:9], s[2:3], s[2:3] ; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 ; GFX8-NEXT: v_readfirstlane_b32 s5, v2 ; GFX8-NEXT: s_bitcmp1_b32 s5, 0 ; GFX8-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX8-NEXT: v_cmp_nlg_f64_e32 vcc, s[0:1], v[0:1] -; GFX8-NEXT: v_cmp_gt_f64_e64 s[8:9], |s[0:1]|, |v[0:1]| +; GFX8-NEXT: v_cmp_nlg_f64_e32 vcc, s[2:3], v[0:1] +; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |s[2:3]|, |v[0:1]| ; 
GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 -; GFX8-NEXT: v_cmp_nlg_f64_e64 s[0:1], s[2:3], v[0:1] -; GFX8-NEXT: v_cmp_gt_f64_e64 s[12:13], |s[2:3]|, |v[0:1]| +; GFX8-NEXT: v_cmp_nlg_f64_e64 s[2:3], s[0:1], v[0:1] ; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GFX8-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GFX8-NEXT: s_cselect_b32 s6, 1, -1 +; GFX8-NEXT: s_add_i32 s12, s5, s6 +; GFX8-NEXT: s_and_b64 s[6:7], s[10:11], exec +; GFX8-NEXT: s_cselect_b32 s5, s5, s12 +; GFX8-NEXT: s_bfe_u32 s6, s5, 0x10010 +; GFX8-NEXT: s_or_b32 s10, s5, 0x400000 +; GFX8-NEXT: s_add_i32 s5, s6, s5 +; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |s[0:1]|, |v[0:1]| +; GFX8-NEXT: s_addk_i32 s5, 0x7fff ; GFX8-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX8-NEXT: s_cselect_b32 s8, 1, -1 -; GFX8-NEXT: s_add_i32 s14, s5, s8 -; GFX8-NEXT: s_and_b64 s[8:9], s[10:11], exec -; GFX8-NEXT: s_cselect_b32 s5, s5, s14 -; GFX8-NEXT: s_bfe_u32 s8, s5, 0x10010 -; GFX8-NEXT: s_add_i32 s8, s8, s5 -; GFX8-NEXT: s_addk_i32 s8, 0x7fff -; GFX8-NEXT: s_bitset1_b32 s5, 22 +; GFX8-NEXT: s_cselect_b32 s5, s10, s5 +; GFX8-NEXT: s_and_b32 s5, s5, 0x7fff0000 +; GFX8-NEXT: v_readfirstlane_b32 s10, v3 +; GFX8-NEXT: s_bitcmp1_b32 s10, 0 +; GFX8-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] ; GFX8-NEXT: s_and_b64 s[6:7], s[6:7], exec -; GFX8-NEXT: s_cselect_b32 s5, s5, s8 -; GFX8-NEXT: v_readfirstlane_b32 s8, v3 -; GFX8-NEXT: s_bitcmp1_b32 s8, 0 -; GFX8-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX8-NEXT: s_and_b64 s[6:7], s[12:13], exec -; GFX8-NEXT: v_cmp_u_f64_e64 s[2:3], s[2:3], s[2:3] +; GFX8-NEXT: v_cmp_u_f64_e64 s[0:1], s[0:1], s[0:1] ; GFX8-NEXT: s_cselect_b32 s6, 1, -1 -; GFX8-NEXT: s_add_i32 s6, s8, s6 +; GFX8-NEXT: s_add_i32 s6, s10, s6 +; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX8-NEXT: s_cselect_b32 s2, s10, s6 +; GFX8-NEXT: s_bfe_u32 s3, s2, 0x10010 +; GFX8-NEXT: s_add_i32 s3, s3, s2 +; GFX8-NEXT: s_addk_i32 s3, 0x7fff +; GFX8-NEXT: 
s_bitset1_b32 s2, 22 ; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX8-NEXT: s_cselect_b32 s0, s8, s6 -; GFX8-NEXT: s_bfe_u32 s1, s0, 0x10010 -; GFX8-NEXT: s_add_i32 s1, s1, s0 -; GFX8-NEXT: s_add_i32 s6, s1, 0x7fff -; GFX8-NEXT: s_or_b32 s7, s0, 0x400000 -; GFX8-NEXT: s_and_b64 s[0:1], s[2:3], exec -; GFX8-NEXT: s_cselect_b32 s0, s7, s6 +; GFX8-NEXT: s_cselect_b32 s0, s2, s3 ; GFX8-NEXT: s_lshr_b32 s0, s0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s5 -; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 16 -; GFX8-NEXT: s_mov_b32 s0, 0x7fff7fff +; GFX8-NEXT: s_or_b32 s0, s0, s5 +; GFX8-NEXT: s_mov_b32 s1, 0x7fff7fff +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v1 +; GFX8-NEXT: v_bfi_b32 v0, s1, v0, v1 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -5249,24 +5251,25 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2bf16_sign_v2f32(<2 x bfloat> i ; ; GFX8-LABEL: s_copysign_out_v2bf16_mag_v2bf16_sign_v2f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s3, s1, 0x10010 -; GFX8-NEXT: s_add_i32 s3, s3, s1 -; GFX8-NEXT: s_addk_i32 s3, 0x7fff -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], s1, s1 -; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX8-NEXT: s_cselect_b32 s1, s1, s3 ; GFX8-NEXT: s_bfe_u32 s3, s2, 0x10010 ; GFX8-NEXT: s_add_i32 s3, s3, s2 ; GFX8-NEXT: s_addk_i32 s3, 0x7fff ; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], s2, s2 ; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX8-NEXT: s_cselect_b32 s2, s2, s3 -; GFX8-NEXT: s_lshr_b32 s2, s2, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: v_alignbit_b32 v0, s2, v0, 16 -; GFX8-NEXT: s_mov_b32 s1, 0x7fff7fff -; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: v_bfi_b32 v0, s1, v1, v0 +; GFX8-NEXT: s_and_b32 s4, s2, 0x80000000 +; GFX8-NEXT: s_bfe_u32 s2, s1, 0x10010 +; GFX8-NEXT: s_add_i32 s2, s2, s1 +; GFX8-NEXT: s_add_i32 s5, s2, 0x7fff +; GFX8-NEXT: v_cmp_u_f32_e64 s[2:3], s1, s1 +; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX8-NEXT: s_cselect_b32 
s1, s1, s5 +; GFX8-NEXT: s_lshr_b32 s1, s1, 16 +; GFX8-NEXT: s_or_b32 s1, s1, s4 +; GFX8-NEXT: s_mov_b32 s2, 0x7fff7fff +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_bfi_b32 v0, s2, v0, v1 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -5779,9 +5782,9 @@ define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3f32_sign_v3bf16(<3 x float> %ma ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v3 ; GFX8-NEXT: v_bfi_b32 v1, s4, v2, v4 @@ -6004,8 +6007,8 @@ define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3f64_sign_v3bf16(<3 x double> %m ; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v1 ; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX8-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v6 ; GFX8-NEXT: v_bfi_b32 v1, s4, v10, v7 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -6315,8 +6318,9 @@ define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3bf16_sign_v3f32(<3 x bfloat> %m ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc ; GFX8-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 
v2, v5, v2, vcc ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 @@ -6324,9 +6328,9 @@ define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3bf16_sign_v3f32(<3 x bfloat> %m ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v2 ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v4 @@ -6991,11 +6995,11 @@ define <4 x bfloat> @v_copysign_out_v4bf16_mag_v4f32_sign_v4bf16(<4 x float> %ma ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX8-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v1, 0x7fff0000, v3 ; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff -; GFX8-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v4 ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -7229,19 +7233,18 @@ define <4 x bfloat> @v_copysign_out_v4bf16_mag_v4f64_sign_v4bf16(<4 x double> %m ; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v10, v4, v13, vcc ; GFX8-NEXT: v_bfe_u32 v4, v10, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v10 -; GFX8-NEXT: v_add_u32_e32 v13, vcc, s8, v4 -; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[6:7] +; GFX8-NEXT: v_add_u32_e32 v13, vcc, v4, v10 ; GFX8-NEXT: v_cvt_f64_f32_e32 
v[4:5], v11 +; GFX8-NEXT: v_add_u32_e32 v13, vcc, s8, v13 +; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], v[6:7], v[6:7] +; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[4:5]| +; GFX8-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[4:5] ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 ; GFX8-NEXT: v_and_b32_e32 v7, 1, v11 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v13, v10, s[4:5] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v7 -; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[4:5]| -; GFX8-NEXT: v_cndmask_b32_e32 v6, v13, v10, vcc -; GFX8-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[4:5] -; GFX8-NEXT: v_cvt_f32_f64_e32 v10, v[2:3] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7] +; GFX8-NEXT: v_cvt_f32_f64_e32 v10, v[2:3] ; GFX8-NEXT: v_add_u32_e64 v4, s[6:7], v11, v4 ; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v11, vcc @@ -7267,9 +7270,10 @@ define <4 x bfloat> @v_copysign_out_v4bf16_mag_v4f64_sign_v4bf16(<4 x double> %m ; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v1 ; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GFX8-NEXT: v_alignbit_b32 v1, v6, v12, 16 +; GFX8-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v1, 0x7fff0000, v6 +; GFX8-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v8 ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v9 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -7675,12 +7679,12 @@ define <4 x bfloat> @v_copysign_out_v4bf16_mag_v4bf16_sign_v4f32(<4 x bfloat> %m ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: 
v_alignbit_b32 v2, v3, v2, 16 +; GFX8-NEXT: v_and_b32_e32 v3, 0x80000000, v3 +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v2 -; GFX8-NEXT: v_alignbit_b32 v2, v5, v4, 16 +; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v5 +; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index 8c7d5cffe39d9..97a52b434cb9b 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -18886,30 +18886,30 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB68_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 ; GFX8-NEXT: v_or_b32_e32 
v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -18936,16 +18936,17 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: .LBB68_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX7-NEXT: v_add_f32_e32 v7, v2, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -19215,30 +19216,30 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: 
s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB69_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -19266,16 +19267,17 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: .LBB69_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: 
v_and_b32_e32 v7, 0xffff0000, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 +; GFX7-NEXT: v_add_f32_e32 v7, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -19562,30 +19564,30 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v10, 
0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -19613,16 +19615,17 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: .LBB70_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 +; GFX7-NEXT: v_add_f32_e32 v7, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -19881,29 +19884,29 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: 
s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -19930,16 +19933,17 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; 
GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_add_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -20201,29 +20205,29 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB72_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: 
v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -20252,16 +20256,17 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB72_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_add_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -20545,29 +20550,29 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; 
GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -20596,16 +20601,17 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 
0xffff0000, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_add_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -20877,30 +20883,30 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; 
GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -20928,16 +20934,17 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: .LBB74_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 +; GFX7-NEXT: v_add_f32_e32 v7, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -21203,29 +21210,29 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -21254,16 +21261,17 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 
16, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_add_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -21529,30 +21537,30 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; 
GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -21579,16 +21587,17 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory( ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: .LBB76_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX7-NEXT: v_add_f32_e32 v7, v2, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -21848,29 +21857,29 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 
0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -21897,16 +21906,17 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: 
v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_add_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -22171,30 +22181,30 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: 
v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -22221,16 +22231,17 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX7-NEXT: v_add_f32_e32 v7, v2, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -22490,29 +22501,29 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -22539,16 +22550,17 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_ ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_add_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index 56ad91dd59ffb..4cac44b9fd21d 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -16534,30 +16534,30 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; 
GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -16574,26 +16574,27 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v5 -; GFX7-NEXT: 
v_alignbit_b32 v3, v2, v3, 16 +; GFX7-NEXT: v_max_f32_e32 v7, v2, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -16988,30 +16989,30 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; 
GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -17029,8 +17030,8 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v0, v[4:5] -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 @@ -17039,16 +17040,17 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX7-NEXT: v_max_f32_e32 v7, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v7, 16 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -17464,30 +17466,30 @@ define <2 x bfloat> 
@flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -17505,8 +17507,8 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 ; GFX7-NEXT: 
v_addc_u32_e32 v5, vcc, -1, v1, vcc ; GFX7-NEXT: flat_load_dword v0, v[4:5] -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 @@ -17515,16 +17517,17 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX7-NEXT: v_max_f32_e32 v7, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v7, 16 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -17904,29 +17907,29 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -17942,34 +17945,35 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX7-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 
0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_max_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 -; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB57_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18345,29 +18349,29 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: 
.LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -18385,34 +18389,35 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; 
GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_max_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 -; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB58_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18813,29 +18818,29 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 
0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -18853,34 +18858,35 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: 
v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_max_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 -; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB59_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19271,30 +19277,30 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, 
v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -19312,8 +19318,8 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v0, v[4:5] -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; 
GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 @@ -19322,16 +19328,17 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX7-NEXT: v_max_f32_e32 v7, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v7, 16 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -19719,29 +19726,29 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: 
v_max_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -19759,34 +19766,35 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; 
GFX7-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_max_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 -; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB61_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index f0083bd23660a..85194c53082a0 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -16534,30 +16534,30 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 
16, v2 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -16574,26 +16574,27 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_dword v5, v[0:1] -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX7-NEXT: v_min_f32_e32 v7, v2, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -16988,30 +16989,30 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 ; 
GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -17029,8 +17030,8 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v0, v[4:5] -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 @@ -17039,16 +17040,17 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: 
v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX7-NEXT: v_min_f32_e32 v7, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v7, 16 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -17464,30 +17466,30 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 
s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -17505,8 +17507,8 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc ; GFX7-NEXT: flat_load_dword v0, v[4:5] -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 @@ -17515,16 +17517,17 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX7-NEXT: v_min_f32_e32 v7, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v7, 16 +; GFX7-NEXT: 
v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -17904,29 +17907,29 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -17942,34 +17945,35 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX7-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_min_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 -; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: 
v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB57_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18345,29 +18349,29 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -18385,34 +18389,35 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_min_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 -; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 
0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB58_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18813,29 +18818,29 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -18853,34 +18858,35 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_min_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 -; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: 
buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB59_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19271,30 +19277,30 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: 
v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -19312,8 +19318,8 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dword v0, v[4:5] -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 @@ -19322,16 +19328,17 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX7-NEXT: v_min_f32_e32 v7, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v7, 16 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -19719,29 +19726,29 @@ 
define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -19759,34 +19766,35 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 
0x7fc, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_min_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 -; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB61_1 
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index 3ee0bb2122abe..64320ceab2f0c 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -15963,30 +15963,30 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_sub_f32_e32 v3, v3, v4 ; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -16013,16 +16013,17 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX7-NEXT: v_sub_f32_e32 v7, v2, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -16417,30 +16418,30 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: 
v_and_b32_e32 v0, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -16468,16 +16469,17 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 +; GFX7-NEXT: 
v_sub_f32_e32 v7, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -16893,30 +16895,30 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -16944,16 +16946,17 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 +; GFX7-NEXT: v_sub_f32_e32 v7, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -17333,29 +17336,29 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, 
v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -17382,16 +17385,17 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_sub_f32_e32 v7, v4, v3 +; GFX7-NEXT: 
v_or_b32_e32 v5, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -17774,29 +17778,29 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -17825,16 +17829,17 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_sub_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -18242,29 +18247,29 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 
; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -18293,16 +18298,17 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_sub_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: 
v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -18700,30 +18706,30 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; 
GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -18751,16 +18757,17 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 +; GFX7-NEXT: v_sub_f32_e32 v7, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 @@ -19148,29 +19155,29 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 ; 
GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -19199,16 +19206,17 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_sub_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; 
GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll index 64a9727330cfd..3320a21097a3c 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll @@ -435,19 +435,22 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out, ; CI-NEXT: s_lshl_b32 s2, s2, 16 ; CI-NEXT: v_add_f32_e64 v0, s3, 2.0 ; CI-NEXT: v_add_f32_e64 v1, s2, 1.0 -; CI-NEXT: v_readfirstlane_b32 s2, v0 +; CI-NEXT: v_readfirstlane_b32 s2, v1 +; CI-NEXT: v_readfirstlane_b32 s3, v0 ; CI-NEXT: s_and_b32 s2, s2, 0xffff0000 -; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; CI-NEXT: s_and_b32 s3, s3, 0xffff0000 ; CI-NEXT: s_bitset0_b32 s2, 31 -; CI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v1 +; CI-NEXT: s_bitset0_b32 s3, 31 +; CI-NEXT: s_and_b32 s3, s3, 0xffff0000 ; CI-NEXT: s_and_b32 s2, s2, 0xffff0000 +; CI-NEXT: s_xor_b32 s3, s3, 0x80000000 ; CI-NEXT: s_xor_b32 s2, s2, 0x80000000 -; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; CI-NEXT: s_and_b32 s3, s3, 0xffff0000 ; CI-NEXT: s_lshr_b32 s2, s2, 16 -; CI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; CI-NEXT: v_alignbit_b32 v2, s2, v0, 16 +; CI-NEXT: s_or_b32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -459,24 +462,24 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out, ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s3, s2, 16 -; VI-NEXT: v_add_f32_e64 v0, s3, 1.0 +; VI-NEXT: s_and_b32 s3, s2, 0xffff0000 +; VI-NEXT: v_add_f32_e64 v0, s3, 2.0 ; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 ; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 ; VI-NEXT: v_add_u32_e32 
v1, vcc, 0x7fff, v1 ; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; VI-NEXT: s_and_b32 s2, s2, 0xffff0000 +; VI-NEXT: s_lshl_b32 s2, s2, 16 ; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; VI-NEXT: v_add_f32_e64 v1, s2, 2.0 +; VI-NEXT: v_add_f32_e64 v1, s2, 1.0 ; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, 0x80008000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -570,9 +573,10 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_bc_src(ptr addrspace(1) %out, <2 x ; CI-NEXT: s_and_b32 s2, s2, 0x7fff0000 ; CI-NEXT: v_mul_f32_e64 v0, -1.0, s2 ; CI-NEXT: s_lshl_b32 s2, s3, 16 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; CI-NEXT: v_mul_f32_e64 v1, -1.0, s2 -; CI-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v2, v1, v0 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -631,24 +635,26 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshl_b32 s4, s2, 16 ; CI-NEXT: s_and_b32 s2, s2, 0xffff0000 +; CI-NEXT: s_lshl_b32 s5, s3, 16 ; CI-NEXT: v_mul_f32_e64 v2, 1.0, |s2| ; CI-NEXT: s_and_b32 s2, s3, 0xffff0000 -; CI-NEXT: s_lshl_b32 s5, s3, 16 -; CI-NEXT: v_mul_f32_e64 v3, 1.0, |s2| ; CI-NEXT: v_mul_f32_e64 v0, 1.0, |s4| ; CI-NEXT: v_mul_f32_e64 v1, 1.0, |s5| +; CI-NEXT: v_mul_f32_e64 v3, 1.0, |s2| ; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; 
CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; CI-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; CI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; CI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; CI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; CI-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; CI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_or_b32_e32 v1, v1, v3 +; CI-NEXT: v_or_b32_e32 v0, v0, v2 ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -730,13 +736,14 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2bf16(ptr addrspace(1) %out, <2 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s3, s2, 0x7fff0000 -; CI-NEXT: s_and_b32 s2, s2, 0x7fff -; CI-NEXT: s_lshl_b32 s2, s2, 16 -; CI-NEXT: v_mul_f32_e64 v0, s3, -4.0 +; CI-NEXT: s_and_b32 s3, s2, 0x7fff +; CI-NEXT: s_and_b32 s2, s2, 0x7fff0000 +; CI-NEXT: v_mul_f32_e64 v0, s2, -4.0 +; CI-NEXT: s_lshl_b32 s2, s3, 16 ; CI-NEXT: v_mul_f32_e64 v1, s2, -4.0 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v2, v1, v0 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -769,8 +776,8 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2bf16(ptr addrspace(1) %out, <2 ; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; VI-NEXT: v_cmp_u_f32_e32 
vcc, v1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -861,16 +868,17 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2bf16(ptr addrspace(1) %out0, ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: s_and_b32 s1, s4, 0x7fff +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: s_and_b32 s2, s4, 0x7fff0000 -; CI-NEXT: v_mul_f32_e64 v4, -1.0, s2 ; CI-NEXT: s_lshl_b32 s1, s1, 16 +; CI-NEXT: v_mul_f32_e64 v4, -1.0, s2 +; CI-NEXT: v_mul_f32_e64 v5, -1.0, s1 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_and_b32 s0, s4, 0x7fff7fff -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; CI-NEXT: v_mul_f32_e64 v5, -1.0, s1 -; CI-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; CI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; CI-NEXT: v_or_b32_e32 v4, v5, v4 ; CI-NEXT: v_mov_b32_e32 v5, s0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_store_dword v[0:1], v5 @@ -943,16 +951,17 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2bf16(ptr addrspa ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_and_b32 s1, s4, 0x7fff +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: s_and_b32 s2, s4, 0x7fff0000 ; CI-NEXT: s_lshl_b32 s1, s1, 16 -; CI-NEXT: v_mul_f32_e64 v4, s1, -4.0 -; CI-NEXT: s_and_b32 s1, s4, 0x7fff0000 +; CI-NEXT: v_mul_f32_e64 v4, s2, -4.0 ; CI-NEXT: v_mul_f32_e64 v5, s1, -4.0 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_and_b32 s0, s4, 0x7fff7fff +; CI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; CI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; CI-NEXT: 
v_or_b32_e32 v4, v5, v4 ; CI-NEXT: v_mov_b32_e32 v5, s0 -; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_store_dword v[0:1], v5 ; CI-NEXT: flat_store_dword v[2:3], v4 @@ -988,8 +997,8 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2bf16(ptr addrspa ; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v5, s0 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll index 1b092b283290a..f73625132998d 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -1382,14 +1382,16 @@ define double @fneg_f64_bitcast_build_vector_v4bf16_to_f64(bfloat %elt0, bfloat ; GFX7-LABEL: fneg_f64_bitcast_build_vector_v4bf16_to_f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_xor_b32_e32 v1, 0x80000000, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; 
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll index d232693b46ad9..fc976fba4ebab 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll @@ -368,8 +368,9 @@ define amdgpu_kernel void @s_fneg_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in ; CI-NEXT: s_lshl_b32 s2, s2, 16 ; CI-NEXT: v_mul_f32_e64 v0, -1.0, s3 ; CI-NEXT: v_mul_f32_e64 v1, -1.0, s2 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v2, v1, v0 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -426,11 +427,12 @@ define amdgpu_kernel void @s_fneg_v2bf16_nonload(ptr addrspace(1) %out) #0 { ; CI-NEXT: ; def s2 ; CI-NEXT: ;;#ASMEND ; CI-NEXT: s_and_b32 s3, s2, 0xffff0000 -; CI-NEXT: v_mul_f32_e64 v0, -1.0, s3 ; CI-NEXT: s_lshl_b32 s2, s2, 16 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_mul_f32_e64 v0, -1.0, s3 ; CI-NEXT: v_mul_f32_e64 v1, -1.0, s2 -; CI-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v2, v1, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 @@ -505,9 +507,10 @@ define amdgpu_kernel void @v_fneg_v2bf16(ptr addrspace(1) %out, ptr addrspace(1) ; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; CI-NEXT: v_mul_f32_e32 v3, -1.0, v3 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_mul_f32_e32 v2, -1.0, v2 -; CI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -573,8 +576,9 @@ define amdgpu_kernel void @fneg_free_v2bf16(ptr addrspace(1) %out, i32 
%in) #0 { ; CI-NEXT: s_lshl_b32 s2, s2, 16 ; CI-NEXT: v_mul_f32_e64 v0, -1.0, s3 ; CI-NEXT: v_mul_f32_e64 v1, -1.0, s2 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v2, v1, v0 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -637,16 +641,17 @@ define amdgpu_kernel void @v_fneg_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; CI-NEXT: v_mul_f32_e32 v4, -1.0, v3 ; CI-NEXT: v_mul_f32_e32 v5, -1.0, v2 -; CI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; CI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; CI-NEXT: v_mul_f32_e32 v3, v4, v3 +; CI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; CI-NEXT: v_mul_f32_e32 v2, v5, v2 +; CI-NEXT: v_mul_f32_e32 v3, v4, v3 +; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; CI-NEXT: v_or_b32_e32 v2, v3, v2 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -682,8 +687,8 @@ define amdgpu_kernel void @v_fneg_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/freeze.ll b/llvm/test/CodeGen/AMDGPU/freeze.ll index 
9a347d71bf430..7f83f8a25ec3e 100644 --- a/llvm/test/CodeGen/AMDGPU/freeze.ll +++ b/llvm/test/CodeGen/AMDGPU/freeze.ll @@ -7017,14 +7017,15 @@ define void @freeze_v3bf16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX6-SDAG-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX6-SDAG-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-SDAG-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX6-SDAG-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-SDAG-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-SDAG-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-SDAG-NEXT: v_alignbit_b32 v0, v4, v0, 16 +; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX6-SDAG-NEXT: buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4 ; GFX6-SDAG-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -7055,14 +7056,15 @@ define void @freeze_v3bf16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX7-SDAG-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX7-SDAG-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-SDAG-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-SDAG-NEXT: v_alignbit_b32 v0, v4, v0, 16 
+; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX7-SDAG-NEXT: buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4 ; GFX7-SDAG-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll index 43caa4c739fb3..816697cc476e2 100644 --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -18,12 +18,12 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: s_lshr_b32 s1, s0, 1 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; SI-NEXT: s_not_b32 s0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: v_alignbit_b32 v0, s1, v0, v1 +; SI-NEXT: s_lshl_b32 s0, s0, s2 +; SI-NEXT: s_lshr_b32 s1, s1, 1 +; SI-NEXT: s_not_b32 s2, s2 +; SI-NEXT: s_lshr_b32 s1, s1, s2 +; SI-NEXT: s_or_b32 s0, s0, s1 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -32,14 +32,14 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: s_lshl_b32 s0, s0, s2 +; VI-NEXT: s_lshr_b32 s1, s1, 1 ; VI-NEXT: s_not_b32 s2, s2 -; VI-NEXT: s_lshr_b32 s1, s0, 1 -; VI-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_alignbit_b32 v2, s1, v0, v1 +; VI-NEXT: s_lshr_b32 s1, s1, s2 +; VI-NEXT: s_or_b32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -49,12 +49,12 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 
v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s1, s1, 1 ; GFX9-NEXT: s_not_b32 s2, s2 -; GFX9-NEXT: s_lshr_b32 s1, s0, 1 -; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, 1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v1, v2 +; GFX9-NEXT: s_lshr_b32 s1, s1, s2 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -77,13 +77,15 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, 1 -; GFX10-NEXT: s_lshr_b32 s0, s0, 1 -; GFX10-NEXT: s_not_b32 s1, s2 -; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1 -; GFX10-NEXT: global_store_dword v1, v0, s[6:7] +; GFX10-NEXT: s_lshr_b32 s1, s1, 1 +; GFX10-NEXT: s_not_b32 s3, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshr_b32 s1, s1, s3 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_i32: @@ -91,14 +93,15 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 1 -; GFX11-NEXT: s_lshr_b32 s0, s0, 1 -; GFX11-NEXT: s_not_b32 s1, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, s1 -; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: s_not_b32 s3, s2 +; 
GFX11-NEXT: s_lshl_b32 s0, s0, s2 +; GFX11-NEXT: s_lshr_b32 s1, s1, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_endpgm entry: %0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z) @@ -113,10 +116,12 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_alignbit_b32 v0, s2, v0, 25 +; SI-NEXT: s_lshr_b32 s0, s3, 25 +; SI-NEXT: s_lshl_b32 s1, s2, 7 +; SI-NEXT: s_or_b32 s0, s1, s0 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -124,10 +129,12 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_alignbit_b32 v2, s2, v0, 25 +; VI-NEXT: s_lshr_b32 s3, s3, 25 +; VI-NEXT: s_lshl_b32 s2, s2, 7 +; VI-NEXT: s_or_b32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -136,8 +143,10 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 25 +; GFX9-NEXT: s_lshr_b32 s3, s3, 25 +; GFX9-NEXT: s_lshl_b32 s2, s2, 7 +; GFX9-NEXT: s_or_b32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -158,16 +167,22 @@ define 
amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 25 +; GFX10-NEXT: s_lshr_b32 s3, s3, 25 +; GFX10-NEXT: s_lshl_b32 s2, s2, 7 +; GFX10-NEXT: s_or_b32 s2, s2, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 25 +; GFX11-NEXT: s_lshr_b32 s3, s3, 25 +; GFX11-NEXT: s_lshl_b32 s2, s2, 7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm entry: @@ -179,70 +194,70 @@ entry: define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) { ; SI-LABEL: fshl_v2i32: ; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xf -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_alignbit_b32 v0, s1, v0, 1 -; SI-NEXT: s_not_b32 s3, s5 -; SI-NEXT: s_lshr_b32 s1, s1, 1 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_alignbit_b32 v1, s1, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: s_not_b32 s1, s4 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; SI-NEXT: s_lshr_b32 s0, s0, 1 -; SI-NEXT: v_mov_b32_e32 v2, s1 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, 
v2 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_lshl_b32 s1, s1, s9 +; SI-NEXT: s_lshr_b32 s3, s3, 1 +; SI-NEXT: s_not_b32 s9, s9 +; SI-NEXT: s_lshr_b32 s3, s3, s9 +; SI-NEXT: s_or_b32 s1, s1, s3 +; SI-NEXT: s_lshr_b32 s2, s2, 1 +; SI-NEXT: s_not_b32 s3, s8 +; SI-NEXT: s_lshl_b32 s0, s0, s8 +; SI-NEXT: s_lshr_b32 s2, s2, s3 +; SI-NEXT: s_or_b32 s0, s0, s2 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_v2i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: s_lshl_b32 s1, s1, s7 +; VI-NEXT: s_lshr_b32 s3, s3, 1 ; VI-NEXT: s_not_b32 s7, s7 -; VI-NEXT: s_lshr_b32 s3, s1, 1 -; VI-NEXT: v_alignbit_b32 v0, s1, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_alignbit_b32 v1, s3, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_not_b32 s1, s6 -; VI-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; VI-NEXT: s_lshr_b32 s0, s0, 1 -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_alignbit_b32 v0, s0, v0, v2 +; VI-NEXT: s_lshr_b32 s3, s3, s7 +; VI-NEXT: s_or_b32 s1, s1, s3 +; VI-NEXT: s_lshr_b32 s2, s2, 1 +; VI-NEXT: s_not_b32 s3, s6 +; VI-NEXT: s_lshl_b32 s0, s0, s6 +; VI-NEXT: s_lshr_b32 s2, s2, s3 +; VI-NEXT: s_or_b32 s0, s0, s2 ; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_v2i32: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x3c +; GFX9-NEXT: s_load_dwordx2 s[8:9], 
s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: s_lshr_b32 s3, s1, 1 -; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, 1 -; GFX9-NEXT: s_not_b32 s1, s9 +; GFX9-NEXT: s_not_b32 s4, s7 +; GFX9-NEXT: s_lshr_b32 s3, s3, 1 +; GFX9-NEXT: s_lshl_b32 s1, s1, s7 +; GFX9-NEXT: s_lshr_b32 s3, s3, s4 +; GFX9-NEXT: s_or_b32 s1, s1, s3 +; GFX9-NEXT: s_lshr_b32 s2, s2, 1 +; GFX9-NEXT: s_not_b32 s3, s6 +; GFX9-NEXT: s_lshl_b32 s0, s0, s6 +; GFX9-NEXT: s_lshr_b32 s2, s2, s3 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v1, s3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_not_b32 s1, s8 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; GFX9-NEXT: s_lshr_b32 s0, s0, 1 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v2i32: @@ -266,38 +281,46 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-LABEL: fshl_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v0, s1, s3, 1 -; GFX10-NEXT: v_alignbit_b32 v3, s0, s2, 1 -; GFX10-NEXT: s_lshr_b32 s1, s1, 1 -; GFX10-NEXT: s_not_b32 s2, s7 -; GFX10-NEXT: s_lshr_b32 s0, s0, 1 -; GFX10-NEXT: s_not_b32 s3, s6 -; GFX10-NEXT: v_alignbit_b32 v1, s1, v0, s2 -; GFX10-NEXT: v_alignbit_b32 v0, s0, v3, s3 +; GFX10-NEXT: s_not_b32 s4, s7 +; GFX10-NEXT: s_lshr_b32 s3, s3, 1 +; GFX10-NEXT: s_lshr_b32 s2, s2, 1 +; GFX10-NEXT: s_not_b32 s5, s6 +; GFX10-NEXT: s_lshl_b32 s1, 
s1, s7 +; GFX10-NEXT: s_lshl_b32 s0, s0, s6 +; GFX10-NEXT: s_lshr_b32 s2, s2, s5 +; GFX10-NEXT: s_lshr_b32 s3, s3, s4 +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_or_b32 s1, s1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_v2i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, s1, s3, 1 -; GFX11-NEXT: v_alignbit_b32 v3, s0, s2, 1 -; GFX11-NEXT: s_lshr_b32 s1, s1, 1 -; GFX11-NEXT: s_not_b32 s2, s7 -; GFX11-NEXT: s_lshr_b32 s0, s0, 1 -; GFX11-NEXT: s_not_b32 s3, s6 -; GFX11-NEXT: v_alignbit_b32 v1, s1, v0, s2 -; GFX11-NEXT: v_alignbit_b32 v0, s0, v3, s3 +; GFX11-NEXT: s_not_b32 s8, s6 +; GFX11-NEXT: s_lshl_b32 s1, s1, s7 +; GFX11-NEXT: s_lshr_b32 s3, s3, 1 +; GFX11-NEXT: s_not_b32 s7, s7 +; GFX11-NEXT: s_lshr_b32 s2, s2, 1 +; GFX11-NEXT: s_lshl_b32 s0, s0, s6 +; GFX11-NEXT: s_lshr_b32 s2, s2, s8 +; GFX11-NEXT: s_lshr_b32 s3, s3, s7 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_or_b32 s1, s1, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm entry: @@ -314,10 +337,14 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_alignbit_b32 v1, s1, v0, 23 -; SI-NEXT: v_alignbit_b32 v0, s0, v2, 25 +; SI-NEXT: s_lshr_b32 s3, s3, 23 +; SI-NEXT: s_lshl_b32 s1, s1, 9 +; SI-NEXT: 
s_lshr_b32 s2, s2, 25 +; SI-NEXT: s_lshl_b32 s0, s0, 7 +; SI-NEXT: s_or_b32 s1, s1, s3 +; SI-NEXT: s_or_b32 s0, s0, s2 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -326,11 +353,15 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_alignbit_b32 v1, s1, v0, 23 -; VI-NEXT: v_alignbit_b32 v0, s0, v2, 25 +; VI-NEXT: s_lshr_b32 s3, s3, 23 +; VI-NEXT: s_lshl_b32 s1, s1, 9 +; VI-NEXT: s_lshr_b32 s2, s2, 25 +; VI-NEXT: s_lshl_b32 s0, s0, 7 +; VI-NEXT: s_or_b32 s1, s1, s3 +; VI-NEXT: s_or_b32 s0, s0, s2 ; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -341,10 +372,14 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, 23 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v3, 25 +; GFX9-NEXT: s_lshr_b32 s3, s3, 23 +; GFX9-NEXT: s_lshl_b32 s1, s1, 9 +; GFX9-NEXT: s_lshr_b32 s2, s2, 25 +; GFX9-NEXT: s_lshl_b32 s0, s0, 7 +; GFX9-NEXT: s_or_b32 s1, s1, s3 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -369,8 +404,14 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX10-NEXT: v_alignbit_b32 v1, s1, s3, 23 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, 25 +; GFX10-NEXT: s_lshr_b32 s3, s3, 23 +; GFX10-NEXT: s_lshr_b32 s2, s2, 25 +; GFX10-NEXT: s_lshl_b32 s0, s0, 7 +; GFX10-NEXT: s_lshl_b32 s1, s1, 9 +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_or_b32 s1, s1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; @@ -379,10 +420,16 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s1, s3, 23 -; GFX11-NEXT: v_alignbit_b32 v0, s0, s2, 25 +; GFX11-NEXT: s_lshr_b32 s3, s3, 23 +; GFX11-NEXT: s_lshr_b32 s2, s2, 25 +; GFX11-NEXT: s_lshl_b32 s0, s0, 7 +; GFX11-NEXT: s_lshl_b32 s1, s1, 9 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_or_b32 s1, s1, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm entry: @@ -395,36 +442,36 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; SI-LABEL: fshl_v4i32: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd -; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x15 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x15 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_not_b32 s5, s19 -; SI-NEXT: v_mov_b32_e32 v0, s15 -; SI-NEXT: v_alignbit_b32 v0, s11, v0, 1 -; SI-NEXT: s_lshr_b32 s4, s11, 1 -; SI-NEXT: v_mov_b32_e32 v1, s5 
-; SI-NEXT: v_alignbit_b32 v3, s4, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s14 -; SI-NEXT: s_not_b32 s5, s18 -; SI-NEXT: v_alignbit_b32 v0, s10, v0, 1 -; SI-NEXT: s_lshr_b32 s4, s10, 1 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_alignbit_b32 v2, s4, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s13 -; SI-NEXT: s_not_b32 s5, s17 -; SI-NEXT: v_alignbit_b32 v0, s9, v0, 1 -; SI-NEXT: s_lshr_b32 s4, s9, 1 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_alignbit_b32 v1, s4, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: s_not_b32 s5, s16 -; SI-NEXT: v_alignbit_b32 v0, s8, v0, 1 -; SI-NEXT: s_lshr_b32 s4, s8, 1 -; SI-NEXT: v_mov_b32_e32 v4, s5 -; SI-NEXT: v_alignbit_b32 v0, s4, v0, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_lshr_b32 s15, s15, 1 +; SI-NEXT: s_lshl_b32 s11, s11, s3 +; SI-NEXT: s_not_b32 s3, s3 +; SI-NEXT: s_lshr_b32 s3, s15, s3 +; SI-NEXT: s_or_b32 s3, s11, s3 +; SI-NEXT: s_lshl_b32 s10, s10, s2 +; SI-NEXT: s_lshr_b32 s11, s14, 1 +; SI-NEXT: s_not_b32 s2, s2 +; SI-NEXT: s_lshr_b32 s2, s11, s2 +; SI-NEXT: s_or_b32 s2, s10, s2 +; SI-NEXT: s_lshl_b32 s9, s9, s1 +; SI-NEXT: s_lshr_b32 s10, s13, 1 +; SI-NEXT: s_not_b32 s1, s1 +; SI-NEXT: s_lshr_b32 s1, s10, s1 +; SI-NEXT: s_or_b32 s1, s9, s1 +; SI-NEXT: s_lshl_b32 s8, s8, s0 +; SI-NEXT: s_lshr_b32 s9, s12, 1 +; SI-NEXT: s_not_b32 s0, s0 +; SI-NEXT: s_lshr_b32 s0, s9, s0 +; SI-NEXT: s_or_b32 s0, s8, s0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_v4i32: @@ -433,31 +480,31 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s15 +; VI-NEXT: s_lshr_b32 s7, s15, 1 +; VI-NEXT: s_lshl_b32 s6, s11, s3 ; VI-NEXT: s_not_b32 s3, s3 -; 
VI-NEXT: s_lshr_b32 s6, s11, 1 -; VI-NEXT: v_alignbit_b32 v0, s11, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_alignbit_b32 v3, s6, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s14 +; VI-NEXT: s_lshr_b32 s3, s7, s3 +; VI-NEXT: s_or_b32 s3, s6, s3 +; VI-NEXT: s_lshl_b32 s6, s10, s2 +; VI-NEXT: s_lshr_b32 s7, s14, 1 ; VI-NEXT: s_not_b32 s2, s2 -; VI-NEXT: v_alignbit_b32 v0, s10, v0, 1 -; VI-NEXT: s_lshr_b32 s3, s10, 1 -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_alignbit_b32 v2, s3, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s13 +; VI-NEXT: s_lshr_b32 s2, s7, s2 +; VI-NEXT: s_or_b32 s2, s6, s2 +; VI-NEXT: s_lshl_b32 s6, s9, s1 +; VI-NEXT: s_lshr_b32 s7, s13, 1 ; VI-NEXT: s_not_b32 s1, s1 -; VI-NEXT: v_alignbit_b32 v0, s9, v0, 1 -; VI-NEXT: s_lshr_b32 s2, s9, 1 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_alignbit_b32 v1, s2, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: s_lshr_b32 s1, s7, s1 +; VI-NEXT: s_or_b32 s1, s6, s1 +; VI-NEXT: s_lshl_b32 s6, s8, s0 +; VI-NEXT: s_lshr_b32 s7, s12, 1 ; VI-NEXT: s_not_b32 s0, s0 -; VI-NEXT: v_alignbit_b32 v0, s8, v0, 1 -; VI-NEXT: s_lshr_b32 s1, s8, 1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_alignbit_b32 v0, s1, v0, v4 +; VI-NEXT: s_lshr_b32 s0, s7, s0 +; VI-NEXT: s_or_b32 s0, s6, s0 ; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -469,30 +516,30 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s5, s15, 1 +; GFX9-NEXT: s_lshl_b32 s4, s11, s3 ; GFX9-NEXT: s_not_b32 s3, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s15 -; GFX9-NEXT: s_lshr_b32 s4, s11, 1 -; GFX9-NEXT: v_alignbit_b32 v0, s11, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: 
v_alignbit_b32 v3, s4, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s14 +; GFX9-NEXT: s_lshr_b32 s3, s5, s3 +; GFX9-NEXT: s_or_b32 s3, s4, s3 +; GFX9-NEXT: s_lshl_b32 s4, s10, s2 +; GFX9-NEXT: s_lshr_b32 s5, s14, 1 ; GFX9-NEXT: s_not_b32 s2, s2 -; GFX9-NEXT: v_alignbit_b32 v0, s10, v0, 1 -; GFX9-NEXT: s_lshr_b32 s3, s10, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_alignbit_b32 v2, s3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: s_lshr_b32 s2, s5, s2 +; GFX9-NEXT: s_or_b32 s2, s4, s2 +; GFX9-NEXT: s_lshl_b32 s4, s9, s1 +; GFX9-NEXT: s_lshr_b32 s5, s13, 1 ; GFX9-NEXT: s_not_b32 s1, s1 -; GFX9-NEXT: v_alignbit_b32 v0, s9, v0, 1 -; GFX9-NEXT: s_lshr_b32 s2, s9, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: s_lshr_b32 s1, s5, s1 +; GFX9-NEXT: s_or_b32 s1, s4, s1 +; GFX9-NEXT: s_lshl_b32 s4, s8, s0 +; GFX9-NEXT: s_lshr_b32 s5, s12, 1 ; GFX9-NEXT: s_not_b32 s0, s0 -; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 1 -; GFX9-NEXT: s_lshr_b32 s1, s8, 1 -; GFX9-NEXT: v_mov_b32_e32 v5, s0 -; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, v5 +; GFX9-NEXT: s_lshr_b32 s0, s5, s0 +; GFX9-NEXT: s_or_b32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -530,22 +577,30 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v0, s11, s15, 1 -; GFX10-NEXT: v_alignbit_b32 v1, s10, s14, 1 -; GFX10-NEXT: v_alignbit_b32 v5, s9, s13, 1 -; GFX10-NEXT: v_alignbit_b32 v6, s8, s12, 1 -; GFX10-NEXT: s_lshr_b32 s4, s11, 1 +; GFX10-NEXT: s_lshr_b32 s5, s15, 1 +; GFX10-NEXT: s_lshl_b32 s4, s11, s3 ; GFX10-NEXT: s_not_b32 s3, s3 -; GFX10-NEXT: s_lshr_b32 s5, s10, 1 
+; GFX10-NEXT: s_lshl_b32 s10, s10, s2 +; GFX10-NEXT: s_lshr_b32 s11, s14, 1 ; GFX10-NEXT: s_not_b32 s2, s2 -; GFX10-NEXT: s_lshr_b32 s9, s9, 1 +; GFX10-NEXT: s_lshl_b32 s9, s9, s1 +; GFX10-NEXT: s_lshr_b32 s13, s13, 1 ; GFX10-NEXT: s_not_b32 s1, s1 -; GFX10-NEXT: s_lshr_b32 s8, s8, 1 +; GFX10-NEXT: s_lshl_b32 s8, s8, s0 +; GFX10-NEXT: s_lshr_b32 s12, s12, 1 ; GFX10-NEXT: s_not_b32 s0, s0 -; GFX10-NEXT: v_alignbit_b32 v3, s4, v0, s3 -; GFX10-NEXT: v_alignbit_b32 v2, s5, v1, s2 -; GFX10-NEXT: v_alignbit_b32 v1, s9, v5, s1 -; GFX10-NEXT: v_alignbit_b32 v0, s8, v6, s0 +; GFX10-NEXT: s_lshr_b32 s3, s5, s3 +; GFX10-NEXT: s_lshr_b32 s2, s11, s2 +; GFX10-NEXT: s_lshr_b32 s1, s13, s1 +; GFX10-NEXT: s_lshr_b32 s0, s12, s0 +; GFX10-NEXT: s_or_b32 s3, s4, s3 +; GFX10-NEXT: s_or_b32 s2, s10, s2 +; GFX10-NEXT: s_or_b32 s0, s8, s0 +; GFX10-NEXT: s_or_b32 s1, s9, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX10-NEXT: s_endpgm ; @@ -555,24 +610,31 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, s11, s15, 1 -; GFX11-NEXT: v_alignbit_b32 v1, s10, s14, 1 -; GFX11-NEXT: v_alignbit_b32 v5, s9, s13, 1 -; GFX11-NEXT: v_alignbit_b32 v6, s8, s12, 1 -; GFX11-NEXT: s_lshr_b32 s6, s11, 1 +; GFX11-NEXT: s_lshr_b32 s7, s15, 1 +; GFX11-NEXT: s_lshl_b32 s6, s11, s3 ; GFX11-NEXT: s_not_b32 s3, s3 -; GFX11-NEXT: s_lshr_b32 s7, s10, 1 +; GFX11-NEXT: s_lshl_b32 s10, s10, s2 +; GFX11-NEXT: s_lshr_b32 s11, s14, 1 ; GFX11-NEXT: s_not_b32 s2, s2 -; GFX11-NEXT: s_lshr_b32 s9, s9, 1 +; GFX11-NEXT: s_lshl_b32 s9, s9, s1 +; GFX11-NEXT: s_lshr_b32 s13, s13, 1 ; GFX11-NEXT: 
s_not_b32 s1, s1 -; GFX11-NEXT: s_lshr_b32 s8, s8, 1 +; GFX11-NEXT: s_lshl_b32 s8, s8, s0 +; GFX11-NEXT: s_lshr_b32 s12, s12, 1 ; GFX11-NEXT: s_not_b32 s0, s0 -; GFX11-NEXT: v_alignbit_b32 v3, s6, v0, s3 -; GFX11-NEXT: v_alignbit_b32 v2, s7, v1, s2 -; GFX11-NEXT: v_alignbit_b32 v1, s9, v5, s1 -; GFX11-NEXT: v_alignbit_b32 v0, s8, v6, s0 +; GFX11-NEXT: s_lshr_b32 s3, s7, s3 +; GFX11-NEXT: s_lshr_b32 s2, s11, s2 +; GFX11-NEXT: s_lshr_b32 s1, s13, s1 +; GFX11-NEXT: s_lshr_b32 s0, s12, s0 +; GFX11-NEXT: s_or_b32 s3, s6, s3 +; GFX11-NEXT: s_or_b32 s2, s10, s2 +; GFX11-NEXT: s_or_b32 s0, s8, s0 +; GFX11-NEXT: s_or_b32 s1, s9, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] ; GFX11-NEXT: s_endpgm entry: @@ -589,14 +651,22 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s15 -; SI-NEXT: v_mov_b32_e32 v1, s14 -; SI-NEXT: v_alignbit_b32 v3, s11, v0, 31 -; SI-NEXT: v_mov_b32_e32 v0, s13 -; SI-NEXT: v_alignbit_b32 v2, s10, v1, 23 -; SI-NEXT: v_alignbit_b32 v1, s9, v0, 25 -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: v_alignbit_b32 v0, s8, v0, 31 +; SI-NEXT: s_lshr_b32 s4, s15, 31 +; SI-NEXT: s_lshl_b32 s5, s11, 1 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_lshr_b32 s5, s14, 23 +; SI-NEXT: s_lshl_b32 s6, s10, 9 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_lshr_b32 s6, s13, 25 +; SI-NEXT: s_lshl_b32 s7, s9, 7 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_lshr_b32 s7, s12, 31 +; SI-NEXT: s_lshl_b32 s8, s8, 1 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, 
s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -605,15 +675,23 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s15 -; VI-NEXT: v_mov_b32_e32 v1, s14 -; VI-NEXT: v_mov_b32_e32 v4, s13 -; VI-NEXT: v_alignbit_b32 v3, s11, v0, 31 -; VI-NEXT: v_alignbit_b32 v2, s10, v1, 23 -; VI-NEXT: v_alignbit_b32 v1, s9, v4, 25 -; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: s_lshr_b32 s2, s15, 31 +; VI-NEXT: s_lshl_b32 s3, s11, 1 +; VI-NEXT: s_lshr_b32 s4, s14, 23 +; VI-NEXT: s_lshl_b32 s5, s10, 9 +; VI-NEXT: s_or_b32 s2, s3, s2 +; VI-NEXT: s_or_b32 s3, s5, s4 +; VI-NEXT: s_lshr_b32 s4, s13, 25 +; VI-NEXT: s_lshl_b32 s5, s9, 7 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_lshr_b32 s5, s12, 31 +; VI-NEXT: s_lshl_b32 s6, s8, 1 +; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_alignbit_b32 v0, s8, v0, 31 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -624,14 +702,22 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s15 -; GFX9-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, 31 -; GFX9-NEXT: v_mov_b32_e32 v0, s13 -; GFX9-NEXT: v_alignbit_b32 v2, s10, v1, 23 -; GFX9-NEXT: v_alignbit_b32 v1, s9, v0, 25 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 31 +; GFX9-NEXT: s_lshr_b32 s2, s15, 31 +; GFX9-NEXT: s_lshl_b32 s3, s11, 1 +; GFX9-NEXT: s_lshr_b32 s4, s14, 23 +; GFX9-NEXT: s_or_b32 s2, s3, s2 +; GFX9-NEXT: s_lshl_b32 s3, s10, 9 +; GFX9-NEXT: s_or_b32 s3, s3, s4 +; GFX9-NEXT: 
s_lshr_b32 s4, s13, 25 +; GFX9-NEXT: s_lshl_b32 s5, s9, 7 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_lshr_b32 s5, s12, 31 +; GFX9-NEXT: s_lshl_b32 s6, s8, 1 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -660,10 +746,22 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v3, s11, s15, 31 -; GFX10-NEXT: v_alignbit_b32 v2, s10, s14, 23 -; GFX10-NEXT: v_alignbit_b32 v1, s9, s13, 25 -; GFX10-NEXT: v_alignbit_b32 v0, s8, s12, 31 +; GFX10-NEXT: s_lshr_b32 s2, s15, 31 +; GFX10-NEXT: s_lshl_b32 s3, s11, 1 +; GFX10-NEXT: s_lshr_b32 s4, s14, 23 +; GFX10-NEXT: s_lshl_b32 s5, s10, 9 +; GFX10-NEXT: s_lshr_b32 s6, s13, 25 +; GFX10-NEXT: s_lshl_b32 s7, s9, 7 +; GFX10-NEXT: s_lshr_b32 s9, s12, 31 +; GFX10-NEXT: s_lshl_b32 s8, s8, 1 +; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: s_or_b32 s3, s5, s4 +; GFX10-NEXT: s_or_b32 s4, s8, s9 +; GFX10-NEXT: s_or_b32 s5, s7, s6 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v2, s3 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -672,12 +770,23 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v3, s11, s15, 31 -; GFX11-NEXT: v_alignbit_b32 v2, s10, s14, 23 -; GFX11-NEXT: v_alignbit_b32 v1, s9, s13, 25 -; GFX11-NEXT: v_alignbit_b32 v0, s8, s12, 31 +; GFX11-NEXT: s_lshr_b32 s2, s15, 
31 +; GFX11-NEXT: s_lshl_b32 s3, s11, 1 +; GFX11-NEXT: s_lshr_b32 s4, s14, 23 +; GFX11-NEXT: s_lshl_b32 s5, s10, 9 +; GFX11-NEXT: s_lshr_b32 s6, s13, 25 +; GFX11-NEXT: s_lshl_b32 s7, s9, 7 +; GFX11-NEXT: s_lshr_b32 s9, s12, 31 +; GFX11-NEXT: s_lshl_b32 s8, s8, 1 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s5, s4 +; GFX11-NEXT: s_or_b32 s4, s8, s9 +; GFX11-NEXT: s_or_b32 s5, s7, s6 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, s3 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll index 4a79096442c96..123b5f8f74115 100644 --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -30,9 +30,12 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_mov_b32_e32 v1, s2 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, v1 +; SI-NEXT: s_lshr_b32 s1, s1, s2 +; SI-NEXT: s_lshl_b32 s0, s0, 1 +; SI-NEXT: s_not_b32 s2, s2 +; SI-NEXT: s_lshl_b32 s0, s0, s2 +; SI-NEXT: s_or_b32 s0, s0, s1 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -41,11 +44,14 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_alignbit_b32 v2, s0, v0, v1 +; VI-NEXT: s_lshr_b32 s1, s1, s2 +; VI-NEXT: s_lshl_b32 s0, s0, 1 +; VI-NEXT: s_not_b32 s2, s2 +; VI-NEXT: s_lshl_b32 s0, s0, s2 +; VI-NEXT: s_or_b32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: 
v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -55,9 +61,12 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, v2 +; GFX9-NEXT: s_lshr_b32 s1, s1, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_not_b32 s2, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -77,62 +86,48 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[6:7] +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_not_b32 s3, s2 +; GFX10-NEXT: s_lshr_b32 s1, s1, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, s3 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[6:7] ; GFX10-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: fshr_i32: -; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s0, s1, v0.l -; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-TRUE16-NEXT: 
s_endpgm -; -; GFX11-FAKE16-LABEL: fshr_i32: -; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s0, s1, v0 -; GFX11-FAKE16-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX11-FAKE16-NEXT: s_endpgm -; -; GFX12-TRUE16-LABEL: fshr_i32: -; GFX12-TRUE16: ; %bb.0: ; %entry -; GFX12-TRUE16-NEXT: s_clause 0x1 -; GFX12-TRUE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x2c -; GFX12-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, s0, s1, v0.l -; GFX12-TRUE16-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-TRUE16-NEXT: s_endpgm -; -; GFX12-FAKE16-LABEL: fshr_i32: -; GFX12-FAKE16: ; %bb.0: ; %entry -; GFX12-FAKE16-NEXT: s_clause 0x1 -; GFX12-FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x2c -; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, s0, s1, v0 -; GFX12-FAKE16-NEXT: global_store_b32 v1, v0, s[4:5] -; GFX12-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: fshr_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: s_not_b32 s3, s2 +; GFX11-NEXT: s_lshr_b32 s1, s1, s2 +; GFX11-NEXT: s_lshl_b32 s0, s0, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fshr_i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x2c +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b32 s0, s0, 1 +; GFX12-NEXT: s_not_b32 s3, s2 +; GFX12-NEXT: s_lshr_b32 s1, s1, s2 +; GFX12-NEXT: s_lshl_b32 s0, s0, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_or_b32 s0, s0, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX12-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX12-NEXT: s_endpgm entry: %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z) store i32 %0, ptr addrspace(1) %in @@ -146,10 +141,12 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_alignbit_b32 v0, s2, v0, 7 +; SI-NEXT: s_lshr_b32 s0, s3, 7 +; SI-NEXT: s_lshl_b32 s1, s2, 25 +; SI-NEXT: s_or_b32 s0, s1, s0 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -157,10 +154,12 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_alignbit_b32 v2, s2, v0, 7 +; VI-NEXT: s_lshr_b32 s3, s3, 7 +; VI-NEXT: s_lshl_b32 s2, s2, 25 +; VI-NEXT: s_or_b32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -169,8 +168,10 @@ define amdgpu_kernel 
void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 7 +; GFX9-NEXT: s_lshr_b32 s3, s3, 7 +; GFX9-NEXT: s_lshl_b32 s2, s2, 25 +; GFX9-NEXT: s_or_b32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -191,25 +192,34 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 7 +; GFX10-NEXT: s_lshr_b32 s3, s3, 7 +; GFX10-NEXT: s_lshl_b32 s2, s2, 25 +; GFX10-NEXT: s_or_b32 s2, s2, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 7 +; GFX11-NEXT: s_lshr_b32 s3, s3, 7 +; GFX11-NEXT: s_lshl_b32 s2, s2, 25 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: fshr_i32_imm: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_alignbit_b32 v1, s2, s3, 7 +; GFX12-NEXT: s_lshr_b32 s3, s3, 7 +; GFX12-NEXT: s_lshl_b32 s2, s2, 25 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_or_b32 s2, s2, s3 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: global_store_b32 v0, 
v1, s[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -221,51 +231,69 @@ entry: define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) { ; SI-LABEL: fshr_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_alignbit_b32 v1, s1, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, v2 +; SI-NEXT: s_lshr_b32 s3, s3, s9 +; SI-NEXT: s_lshl_b32 s1, s1, 1 +; SI-NEXT: s_not_b32 s9, s9 +; SI-NEXT: s_lshl_b32 s1, s1, s9 +; SI-NEXT: s_or_b32 s1, s1, s3 +; SI-NEXT: s_lshl_b32 s0, s0, 1 +; SI-NEXT: s_not_b32 s3, s8 +; SI-NEXT: s_lshr_b32 s2, s2, s8 +; SI-NEXT: s_lshl_b32 s0, s0, s3 +; SI-NEXT: s_or_b32 s0, s0, s2 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_v2i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_alignbit_b32 v1, s1, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_alignbit_b32 v0, s0, v2, v0 +; VI-NEXT: s_lshr_b32 s3, s3, s7 +; VI-NEXT: s_lshl_b32 s1, s1, 1 +; VI-NEXT: s_not_b32 s7, s7 +; VI-NEXT: s_lshl_b32 s1, s1, s7 +; VI-NEXT: s_or_b32 s1, s1, s3 +; VI-NEXT: s_lshl_b32 s0, s0, 1 +; VI-NEXT: s_not_b32 s3, s6 +; VI-NEXT: s_lshr_b32 s2, s2, s6 +; VI-NEXT: s_lshl_b32 s0, s0, s3 +; VI-NEXT: s_or_b32 s0, s0, s2 ; 
VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshr_v2i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3 +; GFX9-NEXT: s_not_b32 s4, s7 +; GFX9-NEXT: s_lshl_b32 s1, s1, 1 +; GFX9-NEXT: s_lshr_b32 s3, s3, s7 +; GFX9-NEXT: s_lshl_b32 s1, s1, s4 +; GFX9-NEXT: s_or_b32 s1, s1, s3 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_not_b32 s3, s6 +; GFX9-NEXT: s_lshr_b32 s2, s2, s6 +; GFX9-NEXT: s_lshl_b32 s0, s0, s3 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX9-NEXT: s_endpgm ; @@ -289,76 +317,68 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s7 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, v0 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, v2 -; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[8:9] +; GFX10-NEXT: s_not_b32 s4, s7 +; GFX10-NEXT: s_lshl_b32 s1, s1, 1 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_not_b32 s5, s6 +; GFX10-NEXT: s_lshr_b32 s3, s3, s7 +; GFX10-NEXT: 
s_lshr_b32 s2, s2, s6 +; GFX10-NEXT: s_lshl_b32 s0, s0, s5 +; GFX10-NEXT: s_lshl_b32 s1, s1, s4 +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_or_b32 s1, s1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX10-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: fshr_v2i32: -; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_clause 0x2 -; GFX11-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s7 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, s1, s3, v0.l -; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s0, s2, v0.h -; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: fshr_v2i32: -; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s6 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, s1, s3, v0 -; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s0, s2, v2 -; GFX11-FAKE16-NEXT: global_store_b64 v3, v[0:1], s[4:5] -; GFX11-FAKE16-NEXT: s_endpgm -; -; GFX12-TRUE16-LABEL: fshr_v2i32: -; GFX12-TRUE16: ; %bb.0: ; %entry -; GFX12-TRUE16-NEXT: s_clause 0x2 -; GFX12-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c -; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; 
GFX12-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, s7 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, s6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_alignbit_b32 v1, s1, s3, v0.l -; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, s0, s2, v0.h -; GFX12-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[4:5] -; GFX12-TRUE16-NEXT: s_endpgm -; -; GFX12-FAKE16-LABEL: fshr_v2i32: -; GFX12-FAKE16: ; %bb.0: ; %entry -; GFX12-FAKE16-NEXT: s_clause 0x2 -; GFX12-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c -; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s6 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_alignbit_b32 v1, s1, s3, v0 -; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, s0, s2, v2 -; GFX12-FAKE16-NEXT: global_store_b64 v3, v[0:1], s[4:5] -; GFX12-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: fshr_v2i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_not_b32 s8, s6 +; GFX11-NEXT: s_lshr_b32 s3, s3, s7 +; GFX11-NEXT: s_lshl_b32 s1, s1, 1 +; GFX11-NEXT: s_not_b32 s7, s7 +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: s_lshr_b32 s2, s2, s6 +; GFX11-NEXT: s_lshl_b32 s0, s0, s8 +; GFX11-NEXT: s_lshl_b32 s1, s1, s7 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_or_b32 s1, s1, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: 
global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fshr_v2i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x2 +; GFX12-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_not_b32 s8, s6 +; GFX12-NEXT: s_lshr_b32 s3, s3, s7 +; GFX12-NEXT: s_lshl_b32 s1, s1, 1 +; GFX12-NEXT: s_not_b32 s7, s7 +; GFX12-NEXT: s_lshl_b32 s0, s0, 1 +; GFX12-NEXT: s_lshr_b32 s2, s2, s6 +; GFX12-NEXT: s_lshl_b32 s0, s0, s8 +; GFX12-NEXT: s_lshl_b32 s1, s1, s7 +; GFX12-NEXT: s_or_b32 s0, s0, s2 +; GFX12-NEXT: s_or_b32 s1, s1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-NEXT: s_endpgm entry: %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z) store <2 x i32> %0, ptr addrspace(1) %in @@ -373,10 +393,14 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_alignbit_b32 v1, s1, v0, 9 -; SI-NEXT: v_alignbit_b32 v0, s0, v2, 7 +; SI-NEXT: s_lshr_b32 s3, s3, 9 +; SI-NEXT: s_lshl_b32 s1, s1, 23 +; SI-NEXT: s_lshr_b32 s2, s2, 7 +; SI-NEXT: s_lshl_b32 s0, s0, 25 +; SI-NEXT: s_or_b32 s1, s1, s3 +; SI-NEXT: s_or_b32 s0, s0, s2 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -385,11 +409,15 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v2, 
s2 -; VI-NEXT: v_alignbit_b32 v1, s1, v0, 9 -; VI-NEXT: v_alignbit_b32 v0, s0, v2, 7 +; VI-NEXT: s_lshr_b32 s3, s3, 9 +; VI-NEXT: s_lshl_b32 s1, s1, 23 +; VI-NEXT: s_lshr_b32 s2, s2, 7 +; VI-NEXT: s_lshl_b32 s0, s0, 25 +; VI-NEXT: s_or_b32 s1, s1, s3 +; VI-NEXT: s_or_b32 s0, s0, s2 ; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -400,10 +428,14 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, 9 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v3, 7 +; GFX9-NEXT: s_lshr_b32 s3, s3, 9 +; GFX9-NEXT: s_lshl_b32 s1, s1, 23 +; GFX9-NEXT: s_lshr_b32 s2, s2, 7 +; GFX9-NEXT: s_lshl_b32 s0, s0, 25 +; GFX9-NEXT: s_or_b32 s1, s1, s3 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -428,8 +460,14 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, 9 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, 7 +; GFX10-NEXT: s_lshr_b32 s3, s3, 9 +; GFX10-NEXT: s_lshr_b32 s2, s2, 7 +; GFX10-NEXT: s_lshl_b32 s0, s0, 25 +; GFX10-NEXT: s_lshl_b32 s1, s1, 23 +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_or_b32 s1, s1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; @@ -438,10 +476,16 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; 
GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s1, s3, 9 -; GFX11-NEXT: v_alignbit_b32 v0, s0, s2, 7 +; GFX11-NEXT: s_lshr_b32 s3, s3, 9 +; GFX11-NEXT: s_lshr_b32 s2, s2, 7 +; GFX11-NEXT: s_lshl_b32 s0, s0, 25 +; GFX11-NEXT: s_lshl_b32 s1, s1, 23 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_or_b32 s1, s1, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm ; @@ -450,10 +494,16 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_alignbit_b32 v1, s1, s3, 9 -; GFX12-NEXT: v_alignbit_b32 v0, s0, s2, 7 +; GFX12-NEXT: s_lshr_b32 s3, s3, 9 +; GFX12-NEXT: s_lshr_b32 s2, s2, 7 +; GFX12-NEXT: s_lshl_b32 s0, s0, 25 +; GFX12-NEXT: s_lshl_b32 s1, s1, 23 +; GFX12-NEXT: s_or_b32 s0, s0, s2 +; GFX12-NEXT: s_or_b32 s1, s1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX12-NEXT: s_endpgm entry: @@ -471,18 +521,30 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s15 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_alignbit_b32 v3, s11, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s14 -; SI-NEXT: v_mov_b32_e32 v1, s2 -; SI-NEXT: v_alignbit_b32 v2, s10, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s13 +; SI-NEXT: 
s_lshl_b32 s11, s11, 1 +; SI-NEXT: s_lshr_b32 s15, s15, s3 +; SI-NEXT: s_not_b32 s3, s3 +; SI-NEXT: s_lshl_b32 s3, s11, s3 +; SI-NEXT: s_lshr_b32 s11, s14, s2 +; SI-NEXT: s_lshl_b32 s10, s10, 1 +; SI-NEXT: s_not_b32 s2, s2 +; SI-NEXT: s_lshl_b32 s2, s10, s2 +; SI-NEXT: s_lshr_b32 s10, s13, s1 +; SI-NEXT: s_lshl_b32 s9, s9, 1 +; SI-NEXT: s_not_b32 s1, s1 +; SI-NEXT: s_lshl_b32 s1, s9, s1 +; SI-NEXT: s_lshr_b32 s9, s12, s0 +; SI-NEXT: s_lshl_b32 s8, s8, 1 +; SI-NEXT: s_not_b32 s0, s0 +; SI-NEXT: s_lshl_b32 s0, s8, s0 +; SI-NEXT: s_or_b32 s3, s3, s15 +; SI-NEXT: s_or_b32 s2, s2, s11 +; SI-NEXT: s_or_b32 s1, s1, s10 +; SI-NEXT: s_or_b32 s0, s0, s9 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_alignbit_b32 v1, s9, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: v_mov_b32_e32 v4, s0 -; SI-NEXT: v_alignbit_b32 v0, s8, v0, v4 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -492,19 +554,31 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s15 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s14 -; VI-NEXT: v_alignbit_b32 v3, s11, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_alignbit_b32 v2, s10, v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s13 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_alignbit_b32 v1, s9, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s12 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_alignbit_b32 v0, s8, v0, v4 +; VI-NEXT: s_lshl_b32 s7, s11, 1 +; VI-NEXT: s_lshr_b32 s6, s15, s3 +; VI-NEXT: s_not_b32 s3, s3 +; VI-NEXT: s_lshl_b32 s3, s7, s3 +; VI-NEXT: s_or_b32 s3, s3, s6 +; VI-NEXT: s_lshr_b32 s6, s14, s2 +; VI-NEXT: s_lshl_b32 s7, s10, 1 +; VI-NEXT: s_not_b32 s2, s2 +; VI-NEXT: s_lshl_b32 s2, s7, s2 +; VI-NEXT: s_or_b32 s2, 
s2, s6 +; VI-NEXT: s_lshr_b32 s6, s13, s1 +; VI-NEXT: s_lshl_b32 s7, s9, 1 +; VI-NEXT: s_not_b32 s1, s1 +; VI-NEXT: s_lshl_b32 s1, s7, s1 +; VI-NEXT: s_or_b32 s1, s1, s6 +; VI-NEXT: s_lshr_b32 s6, s12, s0 +; VI-NEXT: s_lshl_b32 s7, s8, 1 +; VI-NEXT: s_not_b32 s0, s0 +; VI-NEXT: s_lshl_b32 s0, s7, s0 +; VI-NEXT: s_or_b32 s0, s0, s6 ; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -516,18 +590,30 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s15 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s14 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_alignbit_b32 v2, s10, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: s_lshl_b32 s5, s11, 1 +; GFX9-NEXT: s_lshr_b32 s4, s15, s3 +; GFX9-NEXT: s_not_b32 s3, s3 +; GFX9-NEXT: s_lshl_b32 s3, s5, s3 +; GFX9-NEXT: s_or_b32 s3, s3, s4 +; GFX9-NEXT: s_lshr_b32 s4, s14, s2 +; GFX9-NEXT: s_lshl_b32 s5, s10, 1 +; GFX9-NEXT: s_not_b32 s2, s2 +; GFX9-NEXT: s_lshl_b32 s2, s5, s2 +; GFX9-NEXT: s_or_b32 s2, s2, s4 +; GFX9-NEXT: s_lshr_b32 s4, s13, s1 +; GFX9-NEXT: s_lshl_b32 s5, s9, 1 +; GFX9-NEXT: s_not_b32 s1, s1 +; GFX9-NEXT: s_lshl_b32 s1, s5, s1 +; GFX9-NEXT: s_or_b32 s1, s1, s4 +; GFX9-NEXT: s_lshr_b32 s4, s12, s0 +; GFX9-NEXT: s_lshl_b32 s5, s8, 1 +; GFX9-NEXT: s_not_b32 s0, s0 +; GFX9-NEXT: s_lshl_b32 s0, s5, s0 +; GFX9-NEXT: s_or_b32 s0, s0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v1, s9, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_mov_b32_e32 v5, s0 -; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 
v5 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -552,101 +638,105 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-LABEL: fshr_v4i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v6, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s3 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: v_mov_b32_e32 v4, s1 -; GFX10-NEXT: v_mov_b32_e32 v5, s0 -; GFX10-NEXT: v_alignbit_b32 v3, s11, s15, v0 -; GFX10-NEXT: v_alignbit_b32 v2, s10, s14, v1 -; GFX10-NEXT: v_alignbit_b32 v1, s9, s13, v4 -; GFX10-NEXT: v_alignbit_b32 v0, s8, s12, v5 -; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[6:7] +; GFX10-NEXT: s_lshl_b32 s5, s11, 1 +; GFX10-NEXT: s_lshr_b32 s4, s15, s3 +; GFX10-NEXT: s_not_b32 s3, s3 +; GFX10-NEXT: s_lshr_b32 s11, s14, s2 +; GFX10-NEXT: s_lshl_b32 s10, s10, 1 +; GFX10-NEXT: s_not_b32 s2, s2 +; GFX10-NEXT: s_lshr_b32 s13, s13, s1 +; GFX10-NEXT: s_lshl_b32 s9, s9, 1 +; GFX10-NEXT: s_not_b32 s1, s1 +; GFX10-NEXT: s_lshr_b32 s12, s12, s0 +; GFX10-NEXT: s_lshl_b32 s8, s8, 1 +; GFX10-NEXT: s_not_b32 s0, s0 +; GFX10-NEXT: s_lshl_b32 s3, s5, s3 +; GFX10-NEXT: s_lshl_b32 s2, s10, s2 +; GFX10-NEXT: s_lshl_b32 s1, s9, s1 +; GFX10-NEXT: s_lshl_b32 s0, s8, s0 +; GFX10-NEXT: s_or_b32 s3, s3, s4 +; GFX10-NEXT: s_or_b32 s2, s2, s11 +; GFX10-NEXT: s_or_b32 s0, s0, s12 +; GFX10-NEXT: s_or_b32 s1, s1, s13 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GFX10-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: 
fshr_v4i32: -; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_clause 0x2 -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 -; GFX11-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 -; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, 0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_alignbit_b32 v3, s11, s15, v0.l -; GFX11-TRUE16-NEXT: v_alignbit_b32 v2, s10, s14, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, s9, s13, v1.l -; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s8, s12, v4.l -; GFX11-TRUE16-NEXT: global_store_b128 v5, v[0:3], s[4:5] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: fshr_v4i32: -; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 -; GFX11-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 -; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, 0 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_alignbit_b32 v3, s11, s15, v0 -; GFX11-FAKE16-NEXT: v_alignbit_b32 v2, s10, s14, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, s9, s13, v4 -; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s8, s12, v5 -; GFX11-FAKE16-NEXT: global_store_b128 v6, v[0:3], s[4:5] -; GFX11-FAKE16-NEXT: s_endpgm -; 
-; GFX12-TRUE16-LABEL: fshr_v4i32: -; GFX12-TRUE16: ; %bb.0: ; %entry -; GFX12-TRUE16-NEXT: s_clause 0x2 -; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 -; GFX12-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 -; GFX12-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, 0 -; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, s3 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, s2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, s0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_alignbit_b32 v3, s11, s15, v0.l -; GFX12-TRUE16-NEXT: v_alignbit_b32 v2, s10, s14, v0.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_alignbit_b32 v1, s9, s13, v1.l -; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, s8, s12, v4.l -; GFX12-TRUE16-NEXT: global_store_b128 v5, v[0:3], s[4:5] -; GFX12-TRUE16-NEXT: s_endpgm -; -; GFX12-FAKE16-LABEL: fshr_v4i32: -; GFX12-FAKE16: ; %bb.0: ; %entry -; GFX12-FAKE16-NEXT: s_clause 0x2 -; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 -; GFX12-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 -; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, 0 -; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-FAKE16-NEXT: v_alignbit_b32 v3, s11, s15, v0 -; GFX12-FAKE16-NEXT: v_alignbit_b32 v2, s10, s14, v1 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-FAKE16-NEXT: v_alignbit_b32 v1, s9, s13, v4 -; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, s8, s12, v5 -; GFX12-FAKE16-NEXT: global_store_b128 v6, v[0:3], s[4:5] -; 
GFX12-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: fshr_v4i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b32 s7, s11, 1 +; GFX11-NEXT: s_lshr_b32 s6, s15, s3 +; GFX11-NEXT: s_not_b32 s3, s3 +; GFX11-NEXT: s_lshr_b32 s11, s14, s2 +; GFX11-NEXT: s_lshl_b32 s10, s10, 1 +; GFX11-NEXT: s_not_b32 s2, s2 +; GFX11-NEXT: s_lshr_b32 s13, s13, s1 +; GFX11-NEXT: s_lshl_b32 s9, s9, 1 +; GFX11-NEXT: s_not_b32 s1, s1 +; GFX11-NEXT: s_lshr_b32 s12, s12, s0 +; GFX11-NEXT: s_lshl_b32 s8, s8, 1 +; GFX11-NEXT: s_not_b32 s0, s0 +; GFX11-NEXT: s_lshl_b32 s3, s7, s3 +; GFX11-NEXT: s_lshl_b32 s2, s10, s2 +; GFX11-NEXT: s_lshl_b32 s1, s9, s1 +; GFX11-NEXT: s_lshl_b32 s0, s8, s0 +; GFX11-NEXT: s_or_b32 s3, s3, s6 +; GFX11-NEXT: s_or_b32 s2, s2, s11 +; GFX11-NEXT: s_or_b32 s0, s0, s12 +; GFX11-NEXT: s_or_b32 s1, s1, s13 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fshr_v4i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x2 +; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b32 s7, s11, 1 +; GFX12-NEXT: s_lshr_b32 s6, s15, s3 +; GFX12-NEXT: s_not_b32 s3, s3 +; GFX12-NEXT: s_lshr_b32 s11, s14, s2 +; GFX12-NEXT: s_lshl_b32 s10, s10, 1 +; GFX12-NEXT: s_not_b32 s2, s2 +; GFX12-NEXT: s_lshr_b32 s13, s13, s1 +; GFX12-NEXT: s_lshl_b32 s9, s9, 1 +; GFX12-NEXT: s_not_b32 s1, s1 +; GFX12-NEXT: s_lshr_b32 s12, s12, s0 +; GFX12-NEXT: s_lshl_b32 s8, s8, 1 +; GFX12-NEXT: s_not_b32 s0, s0 +; 
GFX12-NEXT: s_lshl_b32 s3, s7, s3 +; GFX12-NEXT: s_lshl_b32 s2, s10, s2 +; GFX12-NEXT: s_lshl_b32 s1, s9, s1 +; GFX12-NEXT: s_lshl_b32 s0, s8, s0 +; GFX12-NEXT: s_or_b32 s3, s3, s6 +; GFX12-NEXT: s_or_b32 s2, s2, s11 +; GFX12-NEXT: s_or_b32 s0, s0, s12 +; GFX12-NEXT: s_or_b32 s1, s1, s13 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] +; GFX12-NEXT: s_endpgm entry: %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) store <4 x i32> %0, ptr addrspace(1) %in @@ -661,14 +751,22 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s15 -; SI-NEXT: v_mov_b32_e32 v1, s14 -; SI-NEXT: v_alignbit_b32 v3, s11, v0, 1 -; SI-NEXT: v_mov_b32_e32 v0, s13 -; SI-NEXT: v_alignbit_b32 v2, s10, v1, 9 -; SI-NEXT: v_alignbit_b32 v1, s9, v0, 7 -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: v_alignbit_b32 v0, s8, v0, 1 +; SI-NEXT: s_lshr_b32 s4, s15, 1 +; SI-NEXT: s_lshl_b32 s5, s11, 31 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_lshr_b32 s5, s14, 9 +; SI-NEXT: s_lshl_b32 s6, s10, 23 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_lshr_b32 s6, s13, 7 +; SI-NEXT: s_lshl_b32 s7, s9, 25 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_lshr_b32 s7, s12, 1 +; SI-NEXT: s_lshl_b32 s8, s8, 31 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -677,15 +775,23 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; VI-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s15 -; VI-NEXT: v_mov_b32_e32 v1, s14 -; VI-NEXT: v_mov_b32_e32 v4, s13 -; VI-NEXT: v_alignbit_b32 v3, s11, v0, 1 -; VI-NEXT: v_alignbit_b32 v2, s10, v1, 9 -; VI-NEXT: v_alignbit_b32 v1, s9, v4, 7 -; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: s_lshr_b32 s2, s15, 1 +; VI-NEXT: s_lshl_b32 s3, s11, 31 +; VI-NEXT: s_lshr_b32 s4, s14, 9 +; VI-NEXT: s_lshl_b32 s5, s10, 23 +; VI-NEXT: s_or_b32 s2, s3, s2 +; VI-NEXT: s_or_b32 s3, s5, s4 +; VI-NEXT: s_lshr_b32 s4, s13, 7 +; VI-NEXT: s_lshl_b32 s5, s9, 25 +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_lshr_b32 s5, s12, 1 +; VI-NEXT: s_lshl_b32 s6, s8, 31 +; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_alignbit_b32 v0, s8, v0, 1 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -696,14 +802,22 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s15 -; GFX9-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v0, s13 -; GFX9-NEXT: v_alignbit_b32 v2, s10, v1, 9 -; GFX9-NEXT: v_alignbit_b32 v1, s9, v0, 7 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 1 +; GFX9-NEXT: s_lshr_b32 s2, s15, 1 +; GFX9-NEXT: s_lshl_b32 s3, s11, 31 +; GFX9-NEXT: s_lshr_b32 s4, s14, 9 +; GFX9-NEXT: s_or_b32 s2, s3, s2 +; GFX9-NEXT: s_lshl_b32 s3, s10, 23 +; GFX9-NEXT: s_or_b32 s3, s3, s4 +; GFX9-NEXT: s_lshr_b32 s4, s13, 7 +; GFX9-NEXT: s_lshl_b32 s5, s9, 25 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_lshr_b32 s5, s12, 1 +; GFX9-NEXT: s_lshl_b32 s6, s8, 31 +; GFX9-NEXT: s_or_b32 s5, s6, s5 +; GFX9-NEXT: 
v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -730,10 +844,22 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v3, s11, s15, 1 -; GFX10-NEXT: v_alignbit_b32 v2, s10, s14, 9 -; GFX10-NEXT: v_alignbit_b32 v1, s9, s13, 7 -; GFX10-NEXT: v_alignbit_b32 v0, s8, s12, 1 +; GFX10-NEXT: s_lshr_b32 s2, s15, 1 +; GFX10-NEXT: s_lshl_b32 s3, s11, 31 +; GFX10-NEXT: s_lshr_b32 s4, s14, 9 +; GFX10-NEXT: s_lshl_b32 s5, s10, 23 +; GFX10-NEXT: s_lshr_b32 s6, s13, 7 +; GFX10-NEXT: s_lshl_b32 s7, s9, 25 +; GFX10-NEXT: s_lshr_b32 s9, s12, 1 +; GFX10-NEXT: s_lshl_b32 s8, s8, 31 +; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: s_or_b32 s3, s5, s4 +; GFX10-NEXT: s_or_b32 s4, s8, s9 +; GFX10-NEXT: s_or_b32 s5, s7, s6 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v2, s3 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -742,12 +868,23 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v3, s11, s15, 1 -; GFX11-NEXT: v_alignbit_b32 v2, s10, s14, 9 -; GFX11-NEXT: v_alignbit_b32 v1, s9, s13, 7 -; GFX11-NEXT: v_alignbit_b32 v0, s8, s12, 1 +; GFX11-NEXT: s_lshr_b32 s2, s15, 1 +; GFX11-NEXT: s_lshl_b32 s3, s11, 31 +; GFX11-NEXT: s_lshr_b32 s4, s14, 9 +; GFX11-NEXT: s_lshl_b32 s5, s10, 23 +; GFX11-NEXT: s_lshr_b32 s6, s13, 7 +; GFX11-NEXT: s_lshl_b32 s7, s9, 25 +; GFX11-NEXT: s_lshr_b32 s9, 
s12, 1 +; GFX11-NEXT: s_lshl_b32 s8, s8, 31 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s3, s5, s4 +; GFX11-NEXT: s_or_b32 s4, s8, s9 +; GFX11-NEXT: s_or_b32 s5, s7, s6 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, s3 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm ; @@ -756,12 +893,23 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_alignbit_b32 v3, s11, s15, 1 -; GFX12-NEXT: v_alignbit_b32 v2, s10, s14, 9 -; GFX12-NEXT: v_alignbit_b32 v1, s9, s13, 7 -; GFX12-NEXT: v_alignbit_b32 v0, s8, s12, 1 +; GFX12-NEXT: s_lshr_b32 s2, s15, 1 +; GFX12-NEXT: s_lshl_b32 s3, s11, 31 +; GFX12-NEXT: s_lshr_b32 s4, s14, 9 +; GFX12-NEXT: s_lshl_b32 s5, s10, 23 +; GFX12-NEXT: s_lshr_b32 s6, s13, 7 +; GFX12-NEXT: s_lshl_b32 s7, s9, 25 +; GFX12-NEXT: s_lshr_b32 s9, s12, 1 +; GFX12-NEXT: s_lshl_b32 s8, s8, 31 +; GFX12-NEXT: s_or_b32 s2, s3, s2 +; GFX12-NEXT: s_or_b32 s3, s5, s4 +; GFX12-NEXT: s_or_b32 s4, s8, s9 +; GFX12-NEXT: s_or_b32 s5, s7, s6 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s2 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -1022,9 +1170,11 @@ define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) { ; SI-LABEL: v_fshr_i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 15, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fshr_i16: @@ -1121,16 +1271,19 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2 ; SI-LABEL: v_fshr_v2i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v2, 15, v4 +; SI-NEXT: v_bfe_u32 v0, v0, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 15, v5 +; SI-NEXT: v_lshrrev_b32_e32 v3, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v3, v5 -; SI-NEXT: v_alignbit_b32 v0, v0, v2, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_bfe_u32 v1, v1, v2, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fshr_v2i16: @@ -1219,20 +1372,25 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2 ; SI-LABEL: v_fshr_v3i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_or_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v1, v1, v4, v7 -; SI-NEXT: v_or_b32_e32 v4, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v0, v0, v3, v4 -; SI-NEXT: v_or_b32_e32 v3, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_and_b32_e32 v3, 15, v6 +; SI-NEXT: v_bfe_u32 v0, v0, v3, 16 +; 
SI-NEXT: v_and_b32_e32 v3, 0xffff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_alignbit_b32 v3, v2, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 15, v7 +; SI-NEXT: v_lshrrev_b32_e32 v1, v3, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 15, v8 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v4, v3, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; SI-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; SI-NEXT: v_bfe_u32 v2, v2, v3, 16 +; SI-NEXT: v_alignbit_b32 v1, v4, v1, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fshr_v3i16: @@ -1422,26 +1580,32 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2 ; SI-LABEL: v_fshr_v4i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_or_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v1, v1, v5, v9 -; SI-NEXT: v_or_b32_e32 v5, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v0, v0, v4, v5 -; SI-NEXT: v_or_b32_e32 v4, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: v_alignbit_b32 v3, v3, v5, v4 -; SI-NEXT: v_or_b32_e32 v5, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_alignbit_b32 v2, v2, v6, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_and_b32_e32 v4, 15, v8 +; SI-NEXT: v_bfe_u32 v0, v0, v4, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_and_b32_e32 v4, 15, v9 +; SI-NEXT: v_lshrrev_b32_e32 v1, v4, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; 
SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v4, 15, v10 +; SI-NEXT: v_bfe_u32 v2, v2, v4, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 15, v11 +; SI-NEXT: v_lshrrev_b32_e32 v5, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v5 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_bfe_u32 v3, v3, v4, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fshr_v4i16: @@ -1882,9 +2046,9 @@ define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) { ; SI-LABEL: v_fshr_i24: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffffff, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; SI-NEXT: s_mov_b32 s4, 0xaaaaaab -; SI-NEXT: v_mul_hi_u32 v3, v3, s4 +; SI-NEXT: v_mul_hi_u32 v3, v2, s4 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; SI-NEXT: v_mul_u32_u24_e32 v3, 24, v3 ; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 @@ -1895,9 +2059,9 @@ define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) { ; VI-LABEL: v_fshr_i24: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v3, 0xffffff, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; VI-NEXT: s_mov_b32 s4, 0xaaaaaab -; VI-NEXT: v_mul_hi_u32 v3, v3, s4 +; VI-NEXT: v_mul_hi_u32 v3, v2, s4 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: v_mul_u32_u24_e32 v3, 24, v3 ; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 @@ -1908,9 +2072,9 @@ define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) { ; GFX9-LABEL: v_fshr_i24: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v3, 0xffffff, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX9-NEXT: s_mov_b32 s4, 0xaaaaaab -; GFX9-NEXT: v_mul_hi_u32 v3, v3, s4 +; GFX9-NEXT: 
v_mul_hi_u32 v3, v2, s4 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v3, 24, v3 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3 @@ -1926,9 +2090,9 @@ define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) { ; GFX10-LABEL: v_fshr_i24: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX10-NEXT: v_mul_hi_u32 v3, 0xaaaaaab, v3 +; GFX10-NEXT: v_mul_hi_u32 v3, 0xaaaaaab, v2 ; GFX10-NEXT: v_mul_u32_u24_e32 v3, 24, v3 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 8, v2 @@ -1938,25 +2102,26 @@ define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) { ; GFX11-TRUE16-LABEL: v_fshr_i24: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffffff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mul_hi_u32 v3, 0xaaaaaab, v3 -; GFX11-TRUE16-NEXT: v_mul_u32_u24_e32 v3, 24, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mul_hi_u32 v3, 0xaaaaaab, v2 +; GFX11-TRUE16-NEXT: v_mul_u32_u24_e32 v3, 24, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v2, v2, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 8, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, v0, v1, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, v0, v3, v1.l ; GFX11-TRUE16-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_fshr_i24: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffffff, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_mul_hi_u32 v3, 0xaaaaaab, v3 +; GFX11-FAKE16-NEXT: v_mul_hi_u32 v3, 0xaaaaaab, v2 ; GFX11-FAKE16-NEXT: v_mul_u32_u24_e32 v3, 24, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v2, v2, v3 @@ -1972,16 +2137,17 @@ define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) { ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffffff, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mul_hi_u32 v3, 0xaaaaaab, v3 -; GFX12-TRUE16-NEXT: v_mul_u32_u24_e32 v3, 24, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mul_hi_u32 v3, 0xaaaaaab, v2 +; GFX12-TRUE16-NEXT: v_mul_u32_u24_e32 v3, 24, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_sub_nc_u32_e32 v2, v2, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v1 ; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v2, 8, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, v0, v1, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, v0, v3, v1.l ; 
GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_fshr_i24: @@ -1991,10 +2157,10 @@ define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) { ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffffff, v2 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_mul_hi_u32 v3, 0xaaaaaab, v3 +; GFX12-FAKE16-NEXT: v_mul_hi_u32 v3, 0xaaaaaab, v2 ; GFX12-FAKE16-NEXT: v_mul_u32_u24_e32 v3, 24, v3 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_sub_nc_u32_e32 v2, v2, v3 @@ -2096,9 +2262,7 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffffff, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v6 ; GFX11-TRUE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -2107,12 +2271,17 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6 ; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 
v6, 8, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 8, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 8, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, v0, v2, v4.l -; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, v1, v3, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, v0, v6, v2.l +; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, v1, v7, v3.l ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_fshr_v2i24: @@ -2148,9 +2317,7 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffffff, v4 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffffff, v5 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v6 ; GFX12-TRUE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -2159,12 +2326,17 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6 ; GFX12-TRUE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v2 +; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4, 8, v4 ; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v5, 8, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, v0, v2, v4.l -; GFX12-TRUE16-NEXT: v_alignbit_b32 v1, v1, v3, v5.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, v0, v6, v2.l +; GFX12-TRUE16-NEXT: v_alignbit_b32 v1, v1, v7, v3.l ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_fshr_v2i24: diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll index f8ff8efbb1ef1..0a18a393cb53a 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -4773,9 +4773,10 @@ define void @void_func_v2bf16(<2 x bfloat> %arg0) #0 { ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; CI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -4807,9 +4808,10 @@ define void @void_func_v3bf16(<3 x bfloat> %arg0) #0 { ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; CI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_or_b32_e32 v0, v0, 
v1 ; CI-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; CI-NEXT: s_mov_b32 s7, 0xf000 @@ -4847,13 +4849,15 @@ define void @void_func_v4bf16(<4 x bfloat> %arg0) #0 { ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; CI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; CI-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_or_b32_e32 v2, v2, v3 +; CI-NEXT: v_or_b32_e32 v1, v0, v1 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0 @@ -4885,21 +4889,25 @@ define void @void_func_v8bf16(<8 x bfloat> %arg0) #0 { ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; CI-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; CI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; CI-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; CI-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; CI-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; CI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; CI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; CI-NEXT: v_alignbit_b32 v5, v5, v4, 16 -; CI-NEXT: v_alignbit_b32 v4, v3, v2, 16 -; CI-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; CI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; CI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; 
CI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_or_b32_e32 v6, v6, v7 +; CI-NEXT: v_or_b32_e32 v5, v4, v5 +; CI-NEXT: v_or_b32_e32 v4, v2, v3 +; CI-NEXT: v_or_b32_e32 v3, v0, v1 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0 @@ -4931,39 +4939,47 @@ define void @void_func_v16bf16(<16 x bfloat> %arg0) #0 { ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; CI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; CI-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; CI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; CI-NEXT: v_alignbit_b32 v5, v5, v4, 16 -; CI-NEXT: v_alignbit_b32 v4, v3, v2, 16 -; CI-NEXT: v_alignbit_b32 v3, v1, v0, 16 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; CI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_or_b32_e32 v5, v4, v5 +; CI-NEXT: v_or_b32_e32 v4, v2, v3 +; CI-NEXT: v_or_b32_e32 v3, v0, v1 +; CI-NEXT: v_mul_f32_e32 v0, 1.0, v15 ; CI-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; CI-NEXT: v_alignbit_b32 v14, v0, v1, 16 +; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v14, v1, v0 ; CI-NEXT: v_mul_f32_e32 v0, 1.0, v13 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; CI-NEXT: v_mul_f32_e32 v1, 1.0, v12 -; CI-NEXT: v_alignbit_b32 v13, v0, v1, 16 +; CI-NEXT: v_and_b32_e32 
v0, 0xffff0000, v0 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v13, v1, v0 ; CI-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; CI-NEXT: v_mul_f32_e32 v1, 1.0, v10 -; CI-NEXT: v_alignbit_b32 v12, v0, v1, 16 +; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v12, v1, v0 ; CI-NEXT: v_mul_f32_e32 v0, 1.0, v9 -; CI-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; CI-NEXT: v_mul_f32_e32 v1, 1.0, v8 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; CI-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; CI-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; CI-NEXT: v_alignbit_b32 v11, v0, v1, 16 +; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; CI-NEXT: v_or_b32_e32 v11, v1, v0 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; CI-NEXT: v_or_b32_e32 v6, v6, v7 ; CI-NEXT: buffer_store_dwordx4 v[11:14], off, s[4:7], 0 ; CI-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 1f74fbdc46e98..86ec634e029ff 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -20632,30 +20632,30 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: 
v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -20686,16 +20686,17 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v3, 
v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX7-NEXT: v_add_f32_e32 v7, v2, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v6, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc @@ -20731,17 +20732,18 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX6-NEXT: .LBB78_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX6-NEXT: v_add_f32_e32 v7, v2, v5 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v6, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc @@ -21013,30 +21015,30 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: 
v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -21066,16 +21068,17 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, 
v5 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX7-NEXT: v_add_f32_e32 v7, v2, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v6, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc @@ -21111,17 +21114,18 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX6-NEXT: .LBB79_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX6-NEXT: v_add_f32_e32 v7, v2, v5 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v6, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc @@ -21394,30 +21398,30 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, 
v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -21451,16 +21455,17 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX7-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: 
v_and_b32_e32 v7, 0xffff0000, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 +; GFX7-NEXT: v_add_f32_e32 v7, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v1 ; GFX7-NEXT: v_mov_b32_e32 v6, v0 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc @@ -21498,17 +21503,18 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX6-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16 +; GFX6-NEXT: v_add_f32_e32 v7, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v1 ; GFX6-NEXT: v_mov_b32_e32 v6, v0 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc @@ -21770,29 +21776,29 
@@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -21823,16 +21829,17 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX7-NEXT: ; 
=>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_add_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc @@ -21866,17 +21873,18 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: v_add_f32_e32 v7, v4, v3 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; 
GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc @@ -22139,29 +22147,29 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -22192,16 +22200,17 @@ define void 
@global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_add_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc @@ -22235,17 +22244,18 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: v_add_f32_e32 v7, v4, v3 +; GFX6-NEXT: 
v_or_b32_e32 v5, v5, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc @@ -22509,29 +22519,29 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; 
GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -22566,16 +22576,17 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_add_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc @@ -22613,17 +22624,18 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; 
GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: v_add_f32_e32 v7, v4, v3 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc @@ -22897,30 +22909,30 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, 
vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -22950,16 +22962,17 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX7-NEXT: v_add_f32_e32 v7, v2, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v6, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc @@ -22995,17 +23008,18 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX6-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX6-NEXT: v_add_f32_e32 v7, v2, v5 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v6, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc @@ -23274,29 +23288,29 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 
v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -23327,16 +23341,17 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_add_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc @@ -23370,17 +23385,18 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 
0xffff0000, v4 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: v_add_f32_e32 v7, v4, v3 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc @@ -23649,30 +23665,30 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: 
v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -23703,16 +23719,17 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX7-NEXT: v_add_f32_e32 v7, v2, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v6, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc @@ -23748,17 +23765,18 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX6-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: 
Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX6-NEXT: v_add_f32_e32 v7, v2, v5 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v6, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc @@ -24021,29 +24039,29 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: 
v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -24074,16 +24092,17 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_add_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc @@ -24117,17 +24136,18 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr 
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: .LBB87_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: v_add_f32_e32 v7, v4, v3 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc @@ -24395,30 +24415,30 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 
v7, v3 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -24449,16 +24469,17 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX7-NEXT: v_add_f32_e32 v7, v2, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v6, v2 ; GFX7-NEXT: 
buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc @@ -24494,17 +24515,18 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX6-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX6-NEXT: v_add_f32_e32 v7, v2, v5 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v6, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc @@ -24767,29 +24789,29 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 ; 
GFX8-NEXT: v_add_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -24820,16 +24842,17 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_add_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_and_b32_e32 
v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc @@ -24863,17 +24886,18 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: v_add_f32_e32 v7, v4, v3 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc @@ -25141,30 +25165,30 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: 
v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -25195,16 +25219,17 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX7-NEXT: 
v_alignbit_b32 v2, v2, v6, 16 +; GFX7-NEXT: v_add_f32_e32 v7, v2, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v6, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc @@ -25240,17 +25265,18 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX6-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX6-NEXT: v_add_f32_e32 v7, v2, v5 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v6, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc @@ -25513,29 +25539,29 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB91_1: ; 
%atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -25566,16 +25592,17 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 
; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_add_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc @@ -25609,17 +25636,18 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: v_add_f32_e32 v7, v4, v3 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index faa74fef2be2f..32d0d679d85cc 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -16338,30 +16338,30 @@ define <2 x bfloat> 
@global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -16382,26 +16382,27 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword 
v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX7-NEXT: v_max_f32_e32 v7, v2, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v6, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc @@ -16427,27 +16428,28 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; 
GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_max_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX6-NEXT: v_max_f32_e32 v7, v2, v5 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v6, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc @@ -16844,30 +16846,30 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_max_f32_e32 
v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -16887,26 +16889,27 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; 
GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX7-NEXT: v_max_f32_e32 v7, v2, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v6, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc @@ -16932,27 +16935,28 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v4 -; GFX6-NEXT: 
v_max_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX6-NEXT: v_max_f32_e32 v7, v2, v5 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v6, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc @@ -17350,30 +17354,30 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; 
GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -17394,8 +17398,8 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc @@ -17407,16 +17411,17 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX7-NEXT: v_max_f32_e32 v7, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v7, 16 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v1 ; GFX7-NEXT: v_mov_b32_e32 v6, v0 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc @@ -17441,8 +17446,8 @@ 
define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc @@ -17454,17 +17459,18 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX6-NEXT: v_max_f32_e32 v7, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v7, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v1 ; GFX6-NEXT: v_mov_b32_e32 v6, v0 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc @@ -17847,29 +17853,29 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; 
GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -17889,36 +17895,37 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 
0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_max_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB57_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17932,37 +17939,38 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; 
GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX6-NEXT: v_max_f32_e32 v7, v4, v3 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB57_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18337,29 +18345,29 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; 
GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -18379,36 +18387,37 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: 
buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_max_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB58_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18422,37 +18431,38 @@ define void 
@global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX6-NEXT: v_max_f32_e32 v7, v4, v3 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, 
s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB58_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18828,29 +18838,29 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; 
GFX8-NEXT: buffer_wbinvl1 @@ -18870,40 +18880,41 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_max_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: 
v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB59_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18917,41 +18928,42 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX6-NEXT: v_max_f32_e32 v7, v4, v3 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 ; 
GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB59_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19342,30 +19354,30 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_max_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, 
v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -19385,26 +19397,27 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX7-NEXT: v_max_f32_e32 v7, v2, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 ; GFX7-NEXT: 
v_lshrrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v6, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc @@ -19430,27 +19443,28 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX6-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_max_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX6-NEXT: v_max_f32_e32 v7, v2, v5 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v6, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], 
v[0:1], s[4:7], 0 addr64 offset:2044 glc @@ -19841,29 +19855,29 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_max_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -19883,36 +19897,37 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; 
GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_max_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; 
GFX7-NEXT: s_cbranch_execnz .LBB61_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19926,37 +19941,38 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_max_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX6-NEXT: v_max_f32_e32 v7, v4, v3 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: 
v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB61_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index a46b0129b79e6..4065a833a89f5 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -16338,30 +16338,30 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; 
GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -16382,26 +16382,27 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX7-NEXT: v_min_f32_e32 v7, v2, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX7-NEXT: 
v_mov_b32_e32 v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v6, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc @@ -16427,27 +16428,28 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX6-NEXT: v_min_f32_e32 v7, v2, v5 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v6, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc @@ -16844,30 +16846,30 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; 
GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -16887,26 +16889,27 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 
1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX7-NEXT: v_min_f32_e32 v7, v2, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v6, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc @@ -16932,27 +16935,28 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; 
GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX6-NEXT: v_min_f32_e32 v7, v2, v5 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v6, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc @@ -17350,30 +17354,30 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX8-NEXT: 
v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -17394,8 +17398,8 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc @@ -17407,16 +17411,17 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: 
v_and_b32_e32 v6, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX7-NEXT: v_min_f32_e32 v7, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v7, 16 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v1 ; GFX7-NEXT: v_mov_b32_e32 v6, v0 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc @@ -17441,8 +17446,8 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc @@ -17454,17 +17459,18 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX6-NEXT: v_min_f32_e32 v7, v1, v3 +; GFX6-NEXT: 
v_or_b32_e32 v1, v0, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v7, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v1 ; GFX6-NEXT: v_mov_b32_e32 v6, v0 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc @@ -17847,29 +17853,29 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -17889,36 +17895,37 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_min_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 
v6, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB57_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17932,37 +17939,38 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX6-NEXT: v_min_f32_e32 v7, v4, v3 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: 
v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB57_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18337,29 +18345,29 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: 
v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -18379,36 +18387,37 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_min_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 
v4, v4, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB58_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18422,37 +18431,38 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_alignbit_b32 
v5, v5, v4, 16 +; GFX6-NEXT: v_min_f32_e32 v7, v4, v3 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB58_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18828,29 +18838,29 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; 
GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -18870,40 +18880,41 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX7-NEXT: s_mov_b32 s5, -1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 
v4, 0xffff0000, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_min_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB59_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18917,41 +18928,42 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX6-NEXT: s_mov_b32 s5, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 
0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX6-NEXT: v_min_f32_e32 v7, v4, v3 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB59_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19342,30 +19354,30 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 
16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -19385,26 +19397,27 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This 
Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v5 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX7-NEXT: v_min_f32_e32 v7, v2, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v6, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc @@ -19430,27 +19443,28 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 ; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX6-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GFX6-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v5 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX6-NEXT: v_min_f32_e32 v7, v2, v5 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v6, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc @@ -19841,29 +19855,29 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; 
GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -19883,36 +19897,37 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_min_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; 
GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_cbranch_execnz .LBB61_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -19926,37 +19941,38 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[8:9], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 
16, v5 ; GFX6-NEXT: v_min_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_min_f32_e32 v7, v7, v3 -; GFX6-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX6-NEXT: v_min_f32_e32 v7, v4, v3 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_cbranch_execnz .LBB61_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index 053efdcb76261..8da0f9e68c718 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -16699,30 +16699,30 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; 
GFX8-NEXT: v_sub_f32_e32 v3, v3, v4 ; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -16753,16 +16753,17 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX7-NEXT: v_sub_f32_e32 v7, v2, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 
v2, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v6, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc @@ -16798,17 +16799,18 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_sub_f32_e32 v7, v7, v5 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX6-NEXT: v_sub_f32_e32 v7, v2, v5 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v6, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc @@ -17205,30 +17207,30 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: 
v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -17258,16 +17260,17 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: 
v_lshrrev_b32_e32 v2, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX7-NEXT: v_sub_f32_e32 v7, v2, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v6, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc @@ -17303,17 +17306,18 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_sub_f32_e32 v7, v7, v5 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX6-NEXT: v_sub_f32_e32 v7, v2, v5 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v6, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc @@ -17711,30 +17715,30 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: 
v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -17768,16 +17772,17 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 +; 
GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 +; GFX7-NEXT: v_sub_f32_e32 v7, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v1 ; GFX7-NEXT: v_mov_b32_e32 v6, v0 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc @@ -17815,17 +17820,18 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6 ; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16 +; GFX6-NEXT: v_sub_f32_e32 v7, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v1 ; GFX6-NEXT: v_mov_b32_e32 v6, v0 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc @@ -18208,29 +18214,29 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 
s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -18261,16 +18267,17 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; 
GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_sub_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc @@ -18304,17 +18311,18 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: v_sub_f32_e32 v7, v4, v3 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc @@ -18698,29 +18706,29 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr 
addrspace( ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -18751,16 +18759,17 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; 
GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_sub_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc @@ -18794,17 +18803,18 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: v_sub_f32_e32 v7, v4, v3 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: 
buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc @@ -19189,29 +19199,29 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -19246,16 +19256,17 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX7-NEXT: 
v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_sub_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc @@ -19293,17 +19304,18 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: v_sub_f32_e32 v7, v4, v3 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v6, 
0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc @@ -19703,30 +19715,30 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_sub_f32_e32 v5, v5, v2 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc ; GFX8-NEXT: s_waitcnt 
vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -19756,16 +19768,17 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX7-NEXT: v_sub_f32_e32 v7, v2, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v3 ; GFX7-NEXT: v_mov_b32_e32 v6, v2 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc @@ -19801,17 +19814,18 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 ; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_sub_f32_e32 v7, v7, v5 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX6-NEXT: 
v_alignbit_b32 v2, v2, v6, 16 +; GFX6-NEXT: v_sub_f32_e32 v7, v2, v5 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v3 ; GFX6-NEXT: v_mov_b32_e32 v6, v2 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc @@ -20202,29 +20216,29 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2 ; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX8-NEXT: v_sub_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_sub_f32_e32 v6, v6, v5 ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: 
v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 @@ -20255,16 +20269,17 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_sub_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: v_sub_f32_e32 v7, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc @@ -20298,17 +20313,18 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_sub_f32_e32 v7, v7, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX6-NEXT: 
v_lshrrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 -; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX6-NEXT: v_sub_f32_e32 v7, v4, v3 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, v5 ; GFX6-NEXT: v_mov_b32_e32 v6, v4 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll index 7ebd69204d87f..ab2618863da2a 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -2253,40 +2253,36 @@ entry: define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc16_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, s3 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s10, 0 +; GFX7-NEXT: s_mov_b32 s11, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: buffer_load_ushort v1, off, s[4:7], 0 ; 
GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2 -; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v2 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v0 +; GFX7-NEXT: v_bfe_u32 v3, v2, 16, 8 ; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 -; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 -; GFX7-NEXT: v_alignbit_b32 v0, v6, v0, 16 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v5, v7, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_bfe_u32 v6, v0, 16, 8 +; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX7-NEXT: v_mad_u32_u24 v1, v4, v3, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: v_mad_u32_u24 v0, v6, v5, v0 -; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX7-NEXT: v_mad_u32_u24 v0, v4, v7, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v5, v8, v0 +; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot4_acc16_vecMul: diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index c947d6976a95f..86bed9c73c1d3 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -238,11 +238,12 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_lshr_b32 s0, s4, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_alignbit_b32 v2, 
s0, v2, 16 +; VI-NEXT: s_and_b32 s1, s2, 0xffff0000 +; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -256,11 +257,12 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_lshr_b32 s1, s4, 16 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s0, s2, 16 -; CI-NEXT: v_alignbit_b32 v2, s0, v2, 16 +; CI-NEXT: s_and_b32 s0, s2, 0xffff0000 +; CI-NEXT: s_or_b32 s0, s1, s0 +; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -312,13 +314,13 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_lshr_b32 s0, s4, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s1, s2, 16 -; VI-NEXT: v_alignbit_b32 v2, s1, v2, 16 +; VI-NEXT: s_and_b32 s1, s2, 0xffff0000 +; VI-NEXT: s_or_b32 s1, s0, s1 +; VI-NEXT: v_mov_b32_e32 v2, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: ;;#ASMSTART ; VI-NEXT: ; use s0 @@ -334,13 +336,13 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_lshr_b32 s0, s4, 16 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s1, s2, 16 -; CI-NEXT: v_alignbit_b32 v2, s1, v2, 16 +; CI-NEXT: s_and_b32 s1, s2, 0xffff0000 +; CI-NEXT: s_or_b32 s1, 
s0, s1 +; CI-NEXT: v_mov_b32_e32 v2, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: ;;#ASMSTART ; CI-NEXT: ; use s0 @@ -405,13 +407,14 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_lshr_b32 s0, s4, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s1, s2, 16 -; VI-NEXT: v_alignbit_b32 v2, s1, v2, 16 +; VI-NEXT: s_and_b32 s2, s2, 0xffff0000 +; VI-NEXT: s_or_b32 s2, s0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: ;;#ASMSTART ; VI-NEXT: ; use s0 @@ -430,19 +433,20 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_lshr_b32 s0, s4, 16 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s1, s2, 16 -; CI-NEXT: v_alignbit_b32 v2, s1, v2, 16 +; CI-NEXT: s_and_b32 s1, s2, 0xffff0000 +; CI-NEXT: s_or_b32 s1, s0, s1 +; CI-NEXT: v_mov_b32_e32 v2, s1 +; CI-NEXT: s_lshr_b32 s2, s2, 16 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: ;;#ASMSTART ; CI-NEXT: ; use s0 ; CI-NEXT: ;;#ASMEND ; CI-NEXT: ;;#ASMSTART -; CI-NEXT: ; use s1 +; CI-NEXT: ; use s2 ; CI-NEXT: ;;#ASMEND ; CI-NEXT: s_endpgm ; @@ -825,10 +829,11 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; CI-NEXT: flat_load_dword v3, v[0:1] ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; CI-NEXT: s_lshr_b32 s0, s4, 16 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 
v2, 16, v3 -; CI-NEXT: v_alignbit_b32 v2, v2, s4, 16 +; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; CI-NEXT: v_or_b32_e32 v2, s0, v2 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll index e1f84dcbaa607..234c8f229ff34 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll @@ -40,9 +40,10 @@ define amdgpu_ps void @buffer_store_v2bf16(ptr addrspace(8) inreg %rsrc, <2 x bf ; GFX7-LABEL: buffer_store_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm ; @@ -73,13 +74,15 @@ define amdgpu_ps void @buffer_store_v4bf16(ptr addrspace(8) inreg %rsrc, <4 x bf ; GFX7-LABEL: buffer_store_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v0, v1 ; GFX7-NEXT: buffer_store_dwordx2 v[1:2], v4, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm ; @@ -116,21 +119,25 @@ define amdgpu_ps void @buffer_store_v8bf16(ptr 
addrspace(8) inreg %rsrc, <8 x bf ; GFX7-LABEL: buffer_store_v8bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 -; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16 -; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX7-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v0, v1 ; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v8, s[0:3], 0 offen ; GFX7-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll index de1f859132e61..c71e7cbddaaf1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll @@ -421,9 +421,10 @@ define amdgpu_ps void @buffer_store_v2bf16(ptr addrspace(8) inreg %rsrc, <2 x bf ; VERDE-LABEL: buffer_store_v2bf16: ; VERDE: ; %bb.0: ; VERDE-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; VERDE-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 ; VERDE-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; VERDE-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; VERDE-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VERDE-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VERDE-NEXT: v_or_b32_e32 v0, v0, v1 ; VERDE-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; VERDE-NEXT: s_endpgm ; @@ -439,13 +440,15 @@ define amdgpu_ps void @buffer_store_v4bf16(ptr addrspace(8) inreg %rsrc, <4 x bf ; VERDE-LABEL: buffer_store_v4bf16: ; VERDE: ; %bb.0: ; VERDE-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; VERDE-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; VERDE-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VERDE-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; VERDE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VERDE-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; VERDE-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; VERDE-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; VERDE-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; VERDE-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VERDE-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VERDE-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VERDE-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VERDE-NEXT: v_or_b32_e32 v2, v2, v3 +; VERDE-NEXT: v_or_b32_e32 v1, v0, v1 ; VERDE-NEXT: buffer_store_dwordx2 v[1:2], v4, s[0:3], 0 offen ; VERDE-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index b1bdfa667f57e..6e564e39b8cd4 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -9771,19 +9771,20 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out ; GFX6-NOHSA: ; %bb.0: ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX6-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_and_b32 s5, s4, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s6, s4, 24 -; 
GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff -; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s6, v0, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s5, s5, 8 -; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s5 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 +; GFX6-NOHSA-NEXT: s_and_b32 s4, s2, 0xff00 +; GFX6-NOHSA-NEXT: s_lshr_b32 s5, s2, 8 +; GFX6-NOHSA-NEXT: s_bfe_u32 s6, s2, 0x80010 +; GFX6-NOHSA-NEXT: s_and_b32 s2, s2, 0xff +; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff0000 +; GFX6-NOHSA-NEXT: s_lshl_b32 s4, s4, 8 +; GFX6-NOHSA-NEXT: s_or_b32 s5, s6, s5 +; GFX6-NOHSA-NEXT: s_or_b32 s4, s2, s4 +; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; @@ -9799,14 +9800,15 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_and_b32 s0, s2, 0xff00 -; GFX7-HSA-NEXT: s_lshr_b32 s1, s2, 24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-HSA-NEXT: s_lshr_b32 s1, s2, 8 +; GFX7-HSA-NEXT: s_bfe_u32 s3, s2, 0x80010 ; GFX7-HSA-NEXT: s_and_b32 s2, s2, 0xff +; GFX7-HSA-NEXT: s_and_b32 s1, s1, 0xff0000 ; GFX7-HSA-NEXT: s_lshl_b32 s0, s0, 8 -; GFX7-HSA-NEXT: v_alignbit_b32 v2, s1, v2, 16 +; GFX7-HSA-NEXT: s_or_b32 s1, s3, s1 ; GFX7-HSA-NEXT: s_or_b32 s0, s2, s0 -; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-HSA-NEXT: s_endpgm ; @@ -9819,14 +9821,15 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NOHSA-NEXT: s_and_b32 s1, s2, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s1, 
s2, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s3, s2, 0xff ; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 8 -; GFX8-NOHSA-NEXT: v_alignbit_b32 v2, s0, v2, 16 -; GFX8-NOHSA-NEXT: s_and_b32 s0, s2, 0xff0000 +; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16 +; GFX8-NOHSA-NEXT: s_and_b32 s2, s2, 0xff0000 ; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0 -; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v2 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NOHSA-NEXT: s_or_b32 s1, s3, s2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NOHSA-NEXT: s_endpgm ; @@ -10001,26 +10004,28 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_and_b32 s6, s4, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s7, s4, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s8, s5, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s9, s5, 24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NOHSA-NEXT: s_and_b32 s2, s4, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s6, s5, 0xff00 +; GFX6-NOHSA-NEXT: s_lshr_b32 s7, s5, 8 +; GFX6-NOHSA-NEXT: s_bfe_u32 s8, s5, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NOHSA-NEXT: s_lshr_b32 s9, s4, 8 +; GFX6-NOHSA-NEXT: s_bfe_u32 s10, s4, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff -; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s9, v0, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s8, s8, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v1, s7, v1, 16 +; GFX6-NOHSA-NEXT: s_and_b32 s7, s7, 0xff0000 ; GFX6-NOHSA-NEXT: s_lshl_b32 s6, s6, 8 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 -; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s8 -; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s6 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1 +; GFX6-NOHSA-NEXT: s_and_b32 s9, s9, 0xff0000 +; 
GFX6-NOHSA-NEXT: s_lshl_b32 s2, s2, 8 +; GFX6-NOHSA-NEXT: s_or_b32 s7, s8, s7 +; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s6 +; GFX6-NOHSA-NEXT: s_or_b32 s6, s10, s9 +; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s2 +; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s7 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; @@ -10032,27 +10037,29 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out ; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_lshr_b32 s5, s3, 24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s5, v0, 16 -; GFX7-HSA-NEXT: s_and_b32 s0, s2, 0xff00 -; GFX7-HSA-NEXT: s_lshr_b32 s1, s2, 24 -; GFX7-HSA-NEXT: s_and_b32 s4, s3, 0xff00 -; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-HSA-NEXT: s_and_b32 s1, s3, 0xff00 +; GFX7-HSA-NEXT: s_lshr_b32 s4, s3, 8 +; GFX7-HSA-NEXT: s_bfe_u32 s5, s3, 0x80010 ; GFX7-HSA-NEXT: s_and_b32 s3, s3, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s4, s4, 8 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s1, v0, 16 -; GFX7-HSA-NEXT: s_and_b32 s1, s2, 0xff +; GFX7-HSA-NEXT: s_lshl_b32 s1, s1, 8 +; GFX7-HSA-NEXT: s_and_b32 s0, s2, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s4, s4, 0xff0000 +; GFX7-HSA-NEXT: s_or_b32 s1, s3, s1 +; GFX7-HSA-NEXT: s_lshr_b32 s3, s2, 8 +; GFX7-HSA-NEXT: s_or_b32 s4, s5, s4 +; GFX7-HSA-NEXT: s_and_b32 s3, s3, 0xff0000 +; GFX7-HSA-NEXT: s_bfe_u32 s5, s2, 0x80010 +; GFX7-HSA-NEXT: s_and_b32 s2, s2, 0xff ; GFX7-HSA-NEXT: s_lshl_b32 s0, s0, 8 -; GFX7-HSA-NEXT: s_or_b32 s3, s3, s4 -; GFX7-HSA-NEXT: 
s_or_b32 s0, s1, s0 -; GFX7-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 +; GFX7-HSA-NEXT: s_or_b32 s3, s5, s3 +; GFX7-HSA-NEXT: s_or_b32 s0, s2, s0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s4 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm ; @@ -10064,25 +10071,26 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NOHSA-NEXT: s_lshr_b32 s1, s3, 24 -; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s3, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s5, s3, 0xff +; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s3, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s3, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s4, s3, 0xff ; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 8 -; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s0, v0, 16 -; GFX8-NOHSA-NEXT: s_and_b32 s0, s2, 0xff -; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 8 -; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s5, s2, 24 +; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NOHSA-NEXT: s_and_b32 s3, s3, 0xff0000 +; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NOHSA-NEXT: s_or_b32 s1, s4, s3 +; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s5, 16 +; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s2, 0x80010 +; GFX8-NOHSA-NEXT: s_or_b32 s3, s4, s3 +; GFX8-NOHSA-NEXT: s_and_b32 s4, s2, 0xff +; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 8 ; GFX8-NOHSA-NEXT: s_and_b32 s2, s2, 0xff0000 -; GFX8-NOHSA-NEXT: s_or_b32 s1, s4, s1 -; GFX8-NOHSA-NEXT: s_or_b32 s3, s5, s3 -; GFX8-NOHSA-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NOHSA-NEXT: s_or_b32 
s2, s4, s2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm ; @@ -10322,42 +10330,47 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_and_b32 s8, s6, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s9, s6, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s10, s7, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s11, s7, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s12, s4, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s13, s4, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s14, s5, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s5, 24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NOHSA-NEXT: s_and_b32 s9, s7, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s10, s4, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s11, s5, 0xff00 +; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s5, 8 +; GFX6-NOHSA-NEXT: s_bfe_u32 s13, s5, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s4, 8 +; GFX6-NOHSA-NEXT: s_bfe_u32 s15, s4, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s7 +; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s7, 8 +; GFX6-NOHSA-NEXT: s_bfe_u32 s17, s7, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s7, s7, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s6 +; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s6, 8 +; GFX6-NOHSA-NEXT: s_bfe_u32 s19, s6, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff -; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s15, v0, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s14, s14, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v1, s13, v1, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s12, s12, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v2, s11, v2, 16 +; GFX6-NOHSA-NEXT: s_and_b32 s12, s12, 0xff0000 +; GFX6-NOHSA-NEXT: s_lshl_b32 s11, s11, 8 +; GFX6-NOHSA-NEXT: s_and_b32 s14, s14, 0xff0000 
; GFX6-NOHSA-NEXT: s_lshl_b32 s10, s10, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v4, s9, v3, 16 +; GFX6-NOHSA-NEXT: s_and_b32 s16, s16, 0xff0000 +; GFX6-NOHSA-NEXT: s_lshl_b32 s9, s9, 8 +; GFX6-NOHSA-NEXT: s_and_b32 s18, s18, 0xff0000 ; GFX6-NOHSA-NEXT: s_lshl_b32 s8, s8, 8 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 -; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s14 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1 -; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s12 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v2 -; GFX6-NOHSA-NEXT: s_or_b32 s7, s7, s10 +; GFX6-NOHSA-NEXT: s_or_b32 s12, s13, s12 +; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s11 +; GFX6-NOHSA-NEXT: s_or_b32 s11, s15, s14 +; GFX6-NOHSA-NEXT: s_or_b32 s13, s17, s16 +; GFX6-NOHSA-NEXT: s_or_b32 s7, s7, s9 +; GFX6-NOHSA-NEXT: s_or_b32 s9, s19, s18 ; GFX6-NOHSA-NEXT: s_or_b32 s6, s6, s8 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s7 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s10 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s13 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s11 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s12 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; @@ -10365,53 +10378,57 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 -; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; 
GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_lshr_b32 s13, s5, 24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s13, v0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s11, s4, 24 -; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s11, v0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s9, s7, 24 -; GFX7-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s9, v0, 16 -; GFX7-HSA-NEXT: s_and_b32 s2, s6, 0xff00 -; GFX7-HSA-NEXT: s_lshr_b32 s3, s6, 24 -; GFX7-HSA-NEXT: s_and_b32 s8, s7, 0xff00 -; GFX7-HSA-NEXT: s_and_b32 s10, s4, 0xff00 -; GFX7-HSA-NEXT: s_and_b32 s12, s5, 0xff00 -; GFX7-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-HSA-NEXT: s_and_b32 s9, s5, 0xff00 +; GFX7-HSA-NEXT: s_lshr_b32 s10, s5, 8 +; GFX7-HSA-NEXT: s_bfe_u32 s11, s5, 0x80010 ; GFX7-HSA-NEXT: s_and_b32 s5, s5, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s12, s12, 8 +; GFX7-HSA-NEXT: s_lshl_b32 s9, s9, 8 +; GFX7-HSA-NEXT: s_and_b32 s8, s4, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s10, s10, 0xff0000 +; GFX7-HSA-NEXT: s_or_b32 s5, s5, s9 +; GFX7-HSA-NEXT: s_lshr_b32 s9, s4, 8 +; GFX7-HSA-NEXT: s_and_b32 s3, s7, 0xff00 +; GFX7-HSA-NEXT: s_or_b32 s10, s11, s10 +; GFX7-HSA-NEXT: s_and_b32 s9, s9, 0xff0000 +; GFX7-HSA-NEXT: s_bfe_u32 s11, s4, 0x80010 ; GFX7-HSA-NEXT: s_and_b32 s4, s4, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s10, s10, 8 -; GFX7-HSA-NEXT: s_and_b32 s7, s7, 0xff ; GFX7-HSA-NEXT: s_lshl_b32 s8, s8, 8 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s3, v0, 16 -; GFX7-HSA-NEXT: s_and_b32 s3, s6, 0xff +; GFX7-HSA-NEXT: s_or_b32 s9, s11, s9 +; GFX7-HSA-NEXT: s_or_b32 s4, s4, s8 +; GFX7-HSA-NEXT: s_lshr_b32 s8, s7, 8 +; GFX7-HSA-NEXT: s_bfe_u32 s11, s7, 0x80010 +; GFX7-HSA-NEXT: s_and_b32 s7, s7, 0xff +; GFX7-HSA-NEXT: s_lshl_b32 s3, s3, 8 +; 
GFX7-HSA-NEXT: s_and_b32 s2, s6, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s8, s8, 0xff0000 +; GFX7-HSA-NEXT: s_or_b32 s3, s7, s3 +; GFX7-HSA-NEXT: s_lshr_b32 s7, s6, 8 +; GFX7-HSA-NEXT: s_or_b32 s8, s11, s8 +; GFX7-HSA-NEXT: s_and_b32 s7, s7, 0xff0000 +; GFX7-HSA-NEXT: s_bfe_u32 s11, s6, 0x80010 +; GFX7-HSA-NEXT: s_and_b32 s6, s6, 0xff ; GFX7-HSA-NEXT: s_lshl_b32 s2, s2, 8 -; GFX7-HSA-NEXT: s_or_b32 s5, s5, s12 -; GFX7-HSA-NEXT: s_or_b32 s4, s4, s10 -; GFX7-HSA-NEXT: s_or_b32 s7, s7, s8 -; GFX7-HSA-NEXT: s_or_b32 s2, s3, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: s_or_b32 s7, s11, s7 +; GFX7-HSA-NEXT: s_or_b32 s2, s6, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GFX7-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s2 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm @@ -10422,50 +10439,52 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s4, 24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s3, v0, 16 -; GFX8-NOHSA-NEXT: s_and_b32 s3, s4, 0xff -; 
GFX8-NOHSA-NEXT: s_lshl_b32 s4, s4, 8 -; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s5, 24 -; GFX8-NOHSA-NEXT: s_and_b32 s4, s4, 0xff0000 -; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s5, 0x80010 -; GFX8-NOHSA-NEXT: s_lshl_b32 s8, s8, 16 -; GFX8-NOHSA-NEXT: s_or_b32 s4, s3, s4 -; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s7, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s6, 24 -; GFX8-NOHSA-NEXT: s_or_b32 s8, s9, s8 -; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 -; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 16 -; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s7, 0x80010 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NOHSA-NEXT: s_and_b32 s10, s5, 0xff -; GFX8-NOHSA-NEXT: s_lshl_b32 s5, s5, 8 -; GFX8-NOHSA-NEXT: s_or_b32 s3, s9, s3 -; GFX8-NOHSA-NEXT: s_and_b32 s9, s7, 0xff +; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s5, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s3, s5, 0x80010 +; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NOHSA-NEXT: s_or_b32 s11, s3, s2 +; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s4, 8 +; GFX8-NOHSA-NEXT: s_and_b32 s2, s4, 0xff +; GFX8-NOHSA-NEXT: s_and_b32 s3, s3, 0xff0000 +; GFX8-NOHSA-NEXT: s_lshr_b32 s9, s4, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s10, s4, 0x80010 +; GFX8-NOHSA-NEXT: s_or_b32 s4, s2, s3 +; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s7, 24 +; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NOHSA-NEXT: s_bfe_u32 s3, s7, 0x80010 +; GFX8-NOHSA-NEXT: s_or_b32 s2, s3, s2 +; GFX8-NOHSA-NEXT: s_and_b32 s3, s7, 0xff ; GFX8-NOHSA-NEXT: s_lshl_b32 s7, s7, 8 -; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s2, v0, 16 -; GFX8-NOHSA-NEXT: s_and_b32 s2, s6, 0xff -; GFX8-NOHSA-NEXT: s_lshl_b32 s6, s6, 8 -; GFX8-NOHSA-NEXT: s_and_b32 s5, s5, 0xff0000 +; GFX8-NOHSA-NEXT: s_and_b32 s8, s5, 0xff +; GFX8-NOHSA-NEXT: s_lshl_b32 s5, s5, 8 ; GFX8-NOHSA-NEXT: s_and_b32 s7, s7, 0xff0000 -; GFX8-NOHSA-NEXT: s_and_b32 s6, s6, 0xff0000 -; GFX8-NOHSA-NEXT: s_or_b32 s5, s10, s5 +; GFX8-NOHSA-NEXT: s_and_b32 s5, s5, 0xff0000 +; GFX8-NOHSA-NEXT: s_lshl_b32 s9, s9, 16 +; GFX8-NOHSA-NEXT: s_or_b32 s3, s3, s7 +; GFX8-NOHSA-NEXT: s_lshr_b32 s7, s6, 24 +; 
GFX8-NOHSA-NEXT: s_or_b32 s5, s8, s5 +; GFX8-NOHSA-NEXT: s_or_b32 s8, s10, s9 +; GFX8-NOHSA-NEXT: s_lshl_b32 s7, s7, 16 +; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s6, 0x80010 ; GFX8-NOHSA-NEXT: s_or_b32 s7, s9, s7 -; GFX8-NOHSA-NEXT: s_or_b32 s2, s2, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NOHSA-NEXT: s_and_b32 s9, s6, 0xff +; GFX8-NOHSA-NEXT: s_lshl_b32 s6, s6, 8 +; GFX8-NOHSA-NEXT: s_and_b32 s6, s6, 0xff0000 +; GFX8-NOHSA-NEXT: s_or_b32 s6, s9, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s3 -; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm @@ -10850,80 +10869,91 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_mov_b32 s10, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_and_b32 s12, s6, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s13, s6, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s14, s7, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s7, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s16, s4, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s17, s4, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s18, 
s5, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s19, s5, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s20, s2, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s21, s2, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s22, s3, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s23, s3, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s24, s0, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s25, s0, 24 -; GFX6-NOHSA-NEXT: s_and_b32 s26, s1, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s27, s1, 24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NOHSA-NEXT: s_and_b32 s13, s7, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s14, s4, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s15, s5, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s16, s2, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s17, s3, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s18, s0, 0xff00 +; GFX6-NOHSA-NEXT: s_and_b32 s19, s1, 0xff00 +; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s1, 8 +; GFX6-NOHSA-NEXT: s_bfe_u32 s21, s1, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s1, s1, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s0, 8 +; GFX6-NOHSA-NEXT: s_bfe_u32 s23, s0, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s0, s0, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s3 +; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s3, 8 +; GFX6-NOHSA-NEXT: s_bfe_u32 s25, s3, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s3, s3, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s2 +; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s2, 8 +; GFX6-NOHSA-NEXT: s_bfe_u32 s27, s2, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s2, s2, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s5 +; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s5, 8 +; GFX6-NOHSA-NEXT: s_bfe_u32 s29, s5, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s4 +; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s4, 8 +; GFX6-NOHSA-NEXT: s_bfe_u32 s31, s4, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s7 +; GFX6-NOHSA-NEXT: s_lshr_b32 s33, s7, 8 +; GFX6-NOHSA-NEXT: s_bfe_u32 s34, s7, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s7, s7, 0xff -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s6 +; GFX6-NOHSA-NEXT: 
s_lshr_b32 s35, s6, 8 +; GFX6-NOHSA-NEXT: s_bfe_u32 s36, s6, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff -; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s27, v0, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s26, s26, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v1, s25, v1, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s24, s24, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v2, s23, v2, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s22, s22, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v8, s21, v3, 16 -; GFX6-NOHSA-NEXT: s_lshl_b32 s20, s20, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v4, s19, v4, 16 +; GFX6-NOHSA-NEXT: s_and_b32 s20, s20, 0xff0000 +; GFX6-NOHSA-NEXT: s_lshl_b32 s19, s19, 8 +; GFX6-NOHSA-NEXT: s_and_b32 s22, s22, 0xff0000 ; GFX6-NOHSA-NEXT: s_lshl_b32 s18, s18, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v9, s17, v5, 16 +; GFX6-NOHSA-NEXT: s_and_b32 s24, s24, 0xff0000 +; GFX6-NOHSA-NEXT: s_lshl_b32 s17, s17, 8 +; GFX6-NOHSA-NEXT: s_and_b32 s26, s26, 0xff0000 ; GFX6-NOHSA-NEXT: s_lshl_b32 s16, s16, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v6, s15, v6, 16 +; GFX6-NOHSA-NEXT: s_and_b32 s28, s28, 0xff0000 +; GFX6-NOHSA-NEXT: s_lshl_b32 s15, s15, 8 +; GFX6-NOHSA-NEXT: s_and_b32 s30, s30, 0xff0000 ; GFX6-NOHSA-NEXT: s_lshl_b32 s14, s14, 8 -; GFX6-NOHSA-NEXT: v_alignbit_b32 v10, s13, v7, 16 +; GFX6-NOHSA-NEXT: s_and_b32 s33, s33, 0xff0000 +; GFX6-NOHSA-NEXT: s_lshl_b32 s13, s13, 8 +; GFX6-NOHSA-NEXT: s_and_b32 s35, s35, 0xff0000 ; GFX6-NOHSA-NEXT: s_lshl_b32 s12, s12, 8 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 -; GFX6-NOHSA-NEXT: s_or_b32 s1, s1, s26 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1 -; GFX6-NOHSA-NEXT: s_or_b32 s0, s0, s24 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v2 -; GFX6-NOHSA-NEXT: s_or_b32 s3, s3, s22 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v8 -; GFX6-NOHSA-NEXT: s_or_b32 s2, s2, s20 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v11, 0xff00ff, v4 -; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s18 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v9, 0xff00ff, v9 -; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s16 -; GFX6-NOHSA-NEXT: 
v_and_b32_e32 v15, 0xff00ff, v6 -; GFX6-NOHSA-NEXT: s_or_b32 s7, s7, s14 +; GFX6-NOHSA-NEXT: s_or_b32 s20, s21, s20 +; GFX6-NOHSA-NEXT: s_or_b32 s1, s1, s19 +; GFX6-NOHSA-NEXT: s_or_b32 s19, s23, s22 +; GFX6-NOHSA-NEXT: s_or_b32 s0, s0, s18 +; GFX6-NOHSA-NEXT: s_or_b32 s18, s25, s24 +; GFX6-NOHSA-NEXT: s_or_b32 s3, s3, s17 +; GFX6-NOHSA-NEXT: s_or_b32 s17, s27, s26 +; GFX6-NOHSA-NEXT: s_or_b32 s2, s2, s16 +; GFX6-NOHSA-NEXT: s_or_b32 s16, s29, s28 +; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s15 +; GFX6-NOHSA-NEXT: s_or_b32 s15, s31, s30 +; GFX6-NOHSA-NEXT: s_or_b32 s21, s34, s33 +; GFX6-NOHSA-NEXT: s_or_b32 s7, s7, s13 +; GFX6-NOHSA-NEXT: s_or_b32 s13, s36, s35 ; GFX6-NOHSA-NEXT: s_or_b32 s6, s6, s12 -; GFX6-NOHSA-NEXT: v_and_b32_e32 v13, 0xff00ff, v10 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s7 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[8:11], 0 offset:48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s5 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:32 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s3 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s14 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s13 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s21 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s15 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s16 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:32 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s17 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s3 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, 
s18 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s19 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s1 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s20 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; @@ -10936,94 +10966,102 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_lshr_b32 s25, s1, 24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s25, v0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s23, s0, 24 -; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s23, v0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s21, s3, 24 -; GFX7-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s21, v0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s19, s2, 24 -; GFX7-HSA-NEXT: s_and_b32 s24, s1, 0xff00 -; GFX7-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-HSA-NEXT: s_and_b32 s22, s0, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s16, s1, 0xff00 +; GFX7-HSA-NEXT: s_lshr_b32 s17, s1, 8 +; GFX7-HSA-NEXT: s_bfe_u32 s18, s1, 0x80010 ; GFX7-HSA-NEXT: s_and_b32 s1, s1, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s24, s24, 8 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s19, v0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s17, s5, 24 -; GFX7-HSA-NEXT: s_and_b32 s20, s3, 0xff00 -; GFX7-HSA-NEXT: s_or_b32 s24, s1, s24 +; GFX7-HSA-NEXT: s_lshl_b32 s16, s16, 8 +; GFX7-HSA-NEXT: s_and_b32 s17, s17, 0xff0000 +; GFX7-HSA-NEXT: s_or_b32 s16, s1, s16 +; GFX7-HSA-NEXT: s_lshr_b32 s1, s0, 8 +; GFX7-HSA-NEXT: s_and_b32 s15, s0, 0xff00 +; GFX7-HSA-NEXT: s_or_b32 s17, s18, s17 +; GFX7-HSA-NEXT: s_and_b32 s1, s1, 0xff0000 
+; GFX7-HSA-NEXT: s_bfe_u32 s18, s0, 0x80010 +; GFX7-HSA-NEXT: s_or_b32 s18, s18, s1 ; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s1, s22, 8 -; GFX7-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-HSA-NEXT: s_and_b32 s18, s2, 0xff00 -; GFX7-HSA-NEXT: s_or_b32 s22, s0, s1 +; GFX7-HSA-NEXT: s_lshl_b32 s1, s15, 8 +; GFX7-HSA-NEXT: s_or_b32 s15, s0, s1 +; GFX7-HSA-NEXT: s_lshr_b32 s0, s3, 8 +; GFX7-HSA-NEXT: s_and_b32 s14, s3, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff0000 +; GFX7-HSA-NEXT: s_bfe_u32 s1, s3, 0x80010 +; GFX7-HSA-NEXT: s_or_b32 s19, s1, s0 ; GFX7-HSA-NEXT: s_and_b32 s0, s3, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s1, s20, 8 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s17, v0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s15, s4, 24 -; GFX7-HSA-NEXT: s_and_b32 s16, s5, 0xff00 +; GFX7-HSA-NEXT: s_lshl_b32 s1, s14, 8 ; GFX7-HSA-NEXT: s_or_b32 s3, s0, s1 +; GFX7-HSA-NEXT: s_lshr_b32 s0, s2, 8 +; GFX7-HSA-NEXT: s_and_b32 s13, s2, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff0000 +; GFX7-HSA-NEXT: s_bfe_u32 s1, s2, 0x80010 +; GFX7-HSA-NEXT: s_or_b32 s14, s1, s0 ; GFX7-HSA-NEXT: s_and_b32 s0, s2, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s1, s18, 8 -; GFX7-HSA-NEXT: v_and_b32_e32 v11, 0xff00ff, v0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-HSA-NEXT: s_and_b32 s14, s4, 0xff00 +; GFX7-HSA-NEXT: s_lshl_b32 s1, s13, 8 ; GFX7-HSA-NEXT: s_or_b32 s2, s0, s1 +; GFX7-HSA-NEXT: s_lshr_b32 s0, s5, 8 +; GFX7-HSA-NEXT: s_and_b32 s12, s5, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff0000 +; GFX7-HSA-NEXT: s_bfe_u32 s1, s5, 0x80010 +; GFX7-HSA-NEXT: s_or_b32 s13, s1, s0 ; GFX7-HSA-NEXT: s_and_b32 s0, s5, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s1, s16, 8 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s15, v0, 16 -; GFX7-HSA-NEXT: s_and_b32 s12, s7, 0xff00 -; GFX7-HSA-NEXT: s_lshr_b32 s13, s7, 24 +; GFX7-HSA-NEXT: s_lshl_b32 s1, s12, 8 ; GFX7-HSA-NEXT: s_or_b32 s5, s0, s1 -; GFX7-HSA-NEXT: v_and_b32_e32 v9, 0xff00ff, v0 +; GFX7-HSA-NEXT: s_lshr_b32 
s0, s4, 8 +; GFX7-HSA-NEXT: s_and_b32 s11, s4, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff0000 +; GFX7-HSA-NEXT: s_bfe_u32 s1, s4, 0x80010 +; GFX7-HSA-NEXT: s_or_b32 s12, s1, s0 ; GFX7-HSA-NEXT: s_and_b32 s0, s4, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s1, s14, 8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-HSA-NEXT: s_and_b32 s10, s6, 0xff00 +; GFX7-HSA-NEXT: s_lshl_b32 s1, s11, 8 ; GFX7-HSA-NEXT: s_or_b32 s4, s0, s1 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s13, v0, 16 -; GFX7-HSA-NEXT: s_and_b32 s0, s7, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s1, s12, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s11, s6, 24 -; GFX7-HSA-NEXT: v_and_b32_e32 v15, 0xff00ff, v0 -; GFX7-HSA-NEXT: s_or_b32 s0, s0, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-HSA-NEXT: s_and_b32 s1, s6, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s6, s10, 8 -; GFX7-HSA-NEXT: s_or_b32 s1, s1, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s0 +; GFX7-HSA-NEXT: s_lshr_b32 s0, s7, 8 +; GFX7-HSA-NEXT: s_and_b32 s10, s7, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff0000 +; GFX7-HSA-NEXT: s_bfe_u32 s1, s7, 0x80010 +; GFX7-HSA-NEXT: s_or_b32 s0, s1, s0 +; GFX7-HSA-NEXT: s_and_b32 s1, s7, 0xff +; GFX7-HSA-NEXT: s_lshl_b32 s7, s10, 8 +; GFX7-HSA-NEXT: s_or_b32 s1, s1, s7 +; GFX7-HSA-NEXT: s_lshr_b32 s7, s6, 8 +; GFX7-HSA-NEXT: s_and_b32 s7, s7, 0xff0000 +; GFX7-HSA-NEXT: s_bfe_u32 s10, s6, 0x80010 +; GFX7-HSA-NEXT: s_or_b32 s7, s10, s7 +; GFX7-HSA-NEXT: s_and_b32 s10, s6, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s6, s6, 0xff +; GFX7-HSA-NEXT: s_lshl_b32 s10, s10, 8 +; GFX7-HSA-NEXT: s_or_b32 s6, s6, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s1 -; GFX7-HSA-NEXT: v_alignbit_b32 v0, s11, v0, 16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32 -; GFX7-HSA-NEXT: 
v_and_b32_e32 v13, 0xff00ff, v0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s13 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s19 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s17 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm @@ -11034,51 +11072,53 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_lshr_b32 
s14, s1, 24 -; GFX8-NOHSA-NEXT: s_bfe_u32 s15, s1, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s16, s1, 0xff +; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s1, 24 +; GFX8-NOHSA-NEXT: s_bfe_u32 s11, s1, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s12, s1, 0xff ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NOHSA-NEXT: s_lshl_b32 s14, s14, 16 +; GFX8-NOHSA-NEXT: s_lshl_b32 s10, s10, 16 ; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000 ; GFX8-NOHSA-NEXT: s_lshr_b32 s13, s0, 24 -; GFX8-NOHSA-NEXT: s_or_b32 s14, s15, s14 -; GFX8-NOHSA-NEXT: s_or_b32 s15, s16, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NOHSA-NEXT: s_bfe_u32 s14, s0, 0x80010 +; GFX8-NOHSA-NEXT: s_or_b32 s10, s11, s10 +; GFX8-NOHSA-NEXT: s_or_b32 s11, s12, s1 ; GFX8-NOHSA-NEXT: s_and_b32 s1, s0, 0xff ; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 8 +; GFX8-NOHSA-NEXT: s_lshl_b32 s13, s13, 16 ; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff0000 -; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s13, v0, 16 +; GFX8-NOHSA-NEXT: s_or_b32 s12, s14, s13 ; GFX8-NOHSA-NEXT: s_or_b32 s13, s1, s0 ; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s3, 24 ; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s3, 0x80010 -; GFX8-NOHSA-NEXT: s_or_b32 s16, s1, s0 +; GFX8-NOHSA-NEXT: s_or_b32 s14, s1, s0 ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s3, 8 ; GFX8-NOHSA-NEXT: s_and_b32 s0, s3, 0xff ; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000 ; GFX8-NOHSA-NEXT: s_or_b32 s3, s0, s1 +; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24 +; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16 +; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s2, 0x80010 +; GFX8-NOHSA-NEXT: s_or_b32 s15, s1, s0 ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s2, 8 ; GFX8-NOHSA-NEXT: s_and_b32 s0, s2, 0xff ; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000 -; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s2, 24 -; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: s_or_b32 s2, s0, s1 ; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s5, 24 ; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s5, 0x80010 -; 
GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s12, v0, 16 -; GFX8-NOHSA-NEXT: s_or_b32 s12, s1, s0 +; GFX8-NOHSA-NEXT: s_or_b32 s16, s1, s0 ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s5, 8 ; GFX8-NOHSA-NEXT: s_and_b32 s0, s5, 0xff ; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000 ; GFX8-NOHSA-NEXT: s_or_b32 s5, s0, s1 +; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s4, 24 +; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16 +; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s4, 0x80010 +; GFX8-NOHSA-NEXT: s_or_b32 s17, s1, s0 ; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s4, 8 ; GFX8-NOHSA-NEXT: s_and_b32 s0, s4, 0xff ; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000 -; GFX8-NOHSA-NEXT: s_lshr_b32 s11, s4, 24 -; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NOHSA-NEXT: s_or_b32 s4, s0, s1 ; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s7, 24 ; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16 @@ -11086,46 +11126,48 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NOHSA-NEXT: s_and_b32 s1, s7, 0xff ; GFX8-NOHSA-NEXT: s_lshl_b32 s7, s7, 8 -; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s11, v0, 16 ; GFX8-NOHSA-NEXT: s_and_b32 s7, s7, 0xff0000 -; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s6, 24 -; GFX8-NOHSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v0 ; GFX8-NOHSA-NEXT: s_or_b32 s1, s1, s7 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NOHSA-NEXT: s_and_b32 s7, s6, 0xff +; GFX8-NOHSA-NEXT: s_lshr_b32 s7, s6, 24 +; GFX8-NOHSA-NEXT: s_lshl_b32 s7, s7, 16 +; GFX8-NOHSA-NEXT: s_bfe_u32 s18, s6, 0x80010 +; GFX8-NOHSA-NEXT: s_or_b32 s7, s18, s7 +; GFX8-NOHSA-NEXT: s_and_b32 s18, s6, 0xff ; GFX8-NOHSA-NEXT: s_lshl_b32 s6, s6, 8 ; GFX8-NOHSA-NEXT: s_and_b32 s6, s6, 0xff0000 -; GFX8-NOHSA-NEXT: s_or_b32 s6, s7, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s0 +; GFX8-NOHSA-NEXT: s_or_b32 s6, s18, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 48 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NOHSA-NEXT: 
s_addc_u32 s1, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s1 -; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s10, v0, 16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 32 -; GFX8-NOHSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[6:9] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s16 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s15 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s15 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NOHSA-NEXT: 
v_mov_b32_e32 v0, s13 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll index 5bc02c4d63181..b4029d57523ba 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll @@ -9830,12 +9830,13 @@ define amdgpu_kernel void @global_zextload_v4i8_to_v4i16(ptr addrspace(1) %out, ; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00, v0 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff, v0 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v2, 8, v1 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v3, v2 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GCN-NOHSA-SI-NEXT: v_bfe_u32 v3, v0, 16, 8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xff0000, v2 +; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v1 +; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v0, v4 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -9853,12 +9854,13 @@ define amdgpu_kernel void @global_zextload_v4i8_to_v4i16(ptr addrspace(1) %out, ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00, v2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 24, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff, v2 -; GCN-HSA-NEXT: v_alignbit_b32 v2, v4, v2, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v2 -; GCN-HSA-NEXT: v_or_b32_e32 v2, v5, v4 +; GCN-HSA-NEXT: 
v_lshrrev_b32_e32 v4, 8, v2 +; GCN-HSA-NEXT: v_bfe_u32 v5, v2, 16, 8 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xff0000, v4 +; GCN-HSA-NEXT: v_lshlrev_b32_e32 v6, 8, v3 +; GCN-HSA-NEXT: v_or_b32_e32 v3, v5, v4 +; GCN-HSA-NEXT: v_or_b32_e32 v2, v2, v6 ; GCN-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-HSA-NEXT: s_endpgm ; @@ -9878,9 +9880,9 @@ define amdgpu_kernel void @global_zextload_v4i8_to_v4i16(ptr addrspace(1) %out, ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 24, v0 ; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v2, 8, v0 -; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xff0000, v2 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xff00ff, v1 +; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD ; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm @@ -10063,33 +10065,41 @@ define amdgpu_kernel void @global_sextload_v4i8_to_v4i16(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v8i8_to_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v8i8_to_v8i16: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3 +; 
GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xff00, v0 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff00, v1 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff, v0 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v1, v5, v1, 16 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v0, v3, v0, 16 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v5, 8, v2 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff00ff, v1 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v2, v6, v4 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v7, v5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s1, v1 +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s0, 0xff00 +; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s1, 0xff00 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s1, 8 +; GCN-NOHSA-SI-NEXT: s_bfe_u32 s9, s1, 0x80010 +; GCN-NOHSA-SI-NEXT: s_and_b32 s1, s1, 0xff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s0, 8 +; GCN-NOHSA-SI-NEXT: s_bfe_u32 s11, s0, 0x80010 +; GCN-NOHSA-SI-NEXT: s_and_b32 s0, s0, 0xff +; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xff0000 +; GCN-NOHSA-SI-NEXT: s_lshl_b32 s7, s7, 8 +; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xff0000 +; GCN-NOHSA-SI-NEXT: s_lshl_b32 s6, s6, 8 +; GCN-NOHSA-SI-NEXT: s_or_b32 s8, s9, s8 +; GCN-NOHSA-SI-NEXT: s_or_b32 s7, s1, s7 +; GCN-NOHSA-SI-NEXT: s_or_b32 s9, s11, s10 +; GCN-NOHSA-SI-NEXT: s_or_b32 s6, s0, s6 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: 
v_mov_b32_e32 v1, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_zextload_v8i8_to_v8i16: @@ -10105,20 +10115,28 @@ define amdgpu_kernel void @global_zextload_v8i8_to_v8i16(ptr addrspace(1) %out, ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xff00, v0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff00, v1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xff, v0 -; GCN-HSA-NEXT: v_alignbit_b32 v1, v7, v1, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GCN-HSA-NEXT: v_alignbit_b32 v0, v3, v0, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v7, 8, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v1 -; GCN-HSA-NEXT: v_or_b32_e32 v2, v8, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 -; GCN-HSA-NEXT: v_or_b32_e32 v0, v9, v7 +; GCN-HSA-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-HSA-NEXT: v_readfirstlane_b32 s1, v1 +; GCN-HSA-NEXT: s_and_b32 s2, s0, 0xff00 +; GCN-HSA-NEXT: s_and_b32 s3, s1, 0xff00 +; GCN-HSA-NEXT: s_lshr_b32 s4, s1, 8 +; GCN-HSA-NEXT: s_lshr_b32 s6, s0, 8 +; GCN-HSA-NEXT: s_bfe_u32 s5, s1, 0x80010 +; GCN-HSA-NEXT: s_and_b32 s1, s1, 0xff +; GCN-HSA-NEXT: s_bfe_u32 s7, s0, 0x80010 +; GCN-HSA-NEXT: s_and_b32 s0, s0, 0xff +; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xff0000 +; GCN-HSA-NEXT: s_lshl_b32 s3, s3, 8 +; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xff0000 +; GCN-HSA-NEXT: s_lshl_b32 s2, s2, 8 +; GCN-HSA-NEXT: s_or_b32 s4, s5, s4 +; GCN-HSA-NEXT: s_or_b32 s1, s1, s3 +; GCN-HSA-NEXT: s_or_b32 s3, s7, s6 +; GCN-HSA-NEXT: s_or_b32 s0, s0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s4 ; GCN-HSA-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; @@ -10136,23 +10154,28 @@ define amdgpu_kernel void @global_zextload_v8i8_to_v8i16(ptr addrspace(1) %out, ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v1 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s4, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_u32 s6, s4, 0x80010 -; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s4, 0xff +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v0 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v1 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s5, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_u32 s7, s5, 0x80010 +; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s5, 0xff +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s4, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_u32 s10, s4, 0x80010 +; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s4, 0xff ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 16 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xff0000 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s9, s9, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff0000 -; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v2, v2, v0, 16 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v3, 0xff0000, v1 -; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s6, s5 -; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s7, s4 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xff00ff, v2 -; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s7, s6 +; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s8, s5 +; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s10, s9 +; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s11, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s6 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -10412,35 +10435,52 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(ptr addrspace(1) %out ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff00, v2 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v2 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xff00, v3 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xff00, v0 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 24, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xff00, v1 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 24, v1 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xff, v1 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v13, 0xff, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, 0xff, v3 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, 0xff, v2 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v1, v11, v1, 16 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v0, v9, v0, 16 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v7, v7, v3, 16 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v5, v5, v2, 16 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff00ff, v1 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v2, v12, v10 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v13, v8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff00ff, v7 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v6, v14, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00ff, v5 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v4, v15, v4 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v2 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s5, v3 +; GCN-NOHSA-SI-NEXT: 
v_readfirstlane_b32 s6, v0 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s7, v1 +; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s4, 0xff00 +; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s5, 0xff00 +; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s6, 0xff00 +; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s7, 0xff00 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s7, 8 +; GCN-NOHSA-SI-NEXT: s_bfe_u32 s13, s7, 0x80010 +; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s6, 8 +; GCN-NOHSA-SI-NEXT: s_bfe_u32 s15, s6, 0x80010 +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s5, 8 +; GCN-NOHSA-SI-NEXT: s_bfe_u32 s17, s5, 0x80010 +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s4, 8 +; GCN-NOHSA-SI-NEXT: s_bfe_u32 s19, s4, 0x80010 +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xff +; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xff0000 +; GCN-NOHSA-SI-NEXT: s_lshl_b32 s11, s11, 8 +; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, 0xff0000 +; GCN-NOHSA-SI-NEXT: s_lshl_b32 s10, s10, 8 +; GCN-NOHSA-SI-NEXT: s_and_b32 s16, s16, 0xff0000 +; GCN-NOHSA-SI-NEXT: s_lshl_b32 s9, s9, 8 +; GCN-NOHSA-SI-NEXT: s_and_b32 s18, s18, 0xff0000 +; GCN-NOHSA-SI-NEXT: s_lshl_b32 s8, s8, 8 +; GCN-NOHSA-SI-NEXT: s_or_b32 s12, s13, s12 +; GCN-NOHSA-SI-NEXT: s_or_b32 s7, s7, s11 +; GCN-NOHSA-SI-NEXT: s_or_b32 s11, s15, s14 +; GCN-NOHSA-SI-NEXT: s_or_b32 s13, s17, s16 +; GCN-NOHSA-SI-NEXT: s_or_b32 s5, s5, s9 +; GCN-NOHSA-SI-NEXT: s_or_b32 s9, s19, s18 +; GCN-NOHSA-SI-NEXT: s_or_b32 s4, s4, s8 +; GCN-NOHSA-SI-NEXT: s_or_b32 s6, s6, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-SI-NEXT: 
v_mov_b32_e32 v3, s12 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -10454,43 +10494,59 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xff00, v2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 24, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff00, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff00, v0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 24, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xff00, v1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 24, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xff, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v19, 0xff, v2 -; GCN-HSA-NEXT: v_alignbit_b32 v7, v7, v3, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GCN-HSA-NEXT: v_alignbit_b32 v5, v5, v2, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xff, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v0 -; GCN-HSA-NEXT: v_alignbit_b32 v1, v15, v1, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GCN-HSA-NEXT: v_alignbit_b32 v0, v13, v0, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v7 -; GCN-HSA-NEXT: v_or_b32_e32 v6, v18, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v5 -; GCN-HSA-NEXT: v_or_b32_e32 v4, v19, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v1 -; GCN-HSA-NEXT: v_or_b32_e32 v2, v16, v14 -; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 -; GCN-HSA-NEXT: v_or_b32_e32 v0, v17, v12 -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], 
v[0:3] +; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v2 +; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v3 +; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v0 +; GCN-HSA-NEXT: v_readfirstlane_b32 s5, v1 +; GCN-HSA-NEXT: s_and_b32 s6, s2, 0xff00 +; GCN-HSA-NEXT: s_and_b32 s7, s3, 0xff00 +; GCN-HSA-NEXT: s_and_b32 s8, s4, 0xff00 +; GCN-HSA-NEXT: s_and_b32 s9, s5, 0xff00 +; GCN-HSA-NEXT: s_lshr_b32 s10, s5, 8 +; GCN-HSA-NEXT: s_lshr_b32 s12, s4, 8 +; GCN-HSA-NEXT: s_lshr_b32 s14, s3, 8 +; GCN-HSA-NEXT: s_lshr_b32 s16, s2, 8 +; GCN-HSA-NEXT: s_bfe_u32 s11, s5, 0x80010 +; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xff +; GCN-HSA-NEXT: s_bfe_u32 s13, s4, 0x80010 +; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xff +; GCN-HSA-NEXT: s_bfe_u32 s15, s3, 0x80010 +; GCN-HSA-NEXT: s_and_b32 s3, s3, 0xff +; GCN-HSA-NEXT: s_bfe_u32 s17, s2, 0x80010 +; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xff +; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xff0000 +; GCN-HSA-NEXT: s_lshl_b32 s9, s9, 8 +; GCN-HSA-NEXT: s_and_b32 s12, s12, 0xff0000 +; GCN-HSA-NEXT: s_lshl_b32 s8, s8, 8 +; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xff0000 +; GCN-HSA-NEXT: s_lshl_b32 s7, s7, 8 +; GCN-HSA-NEXT: s_and_b32 s16, s16, 0xff0000 +; GCN-HSA-NEXT: s_lshl_b32 s6, s6, 8 +; GCN-HSA-NEXT: s_or_b32 s10, s11, s10 +; GCN-HSA-NEXT: s_or_b32 s5, s5, s9 +; GCN-HSA-NEXT: s_or_b32 s9, s13, s12 +; GCN-HSA-NEXT: s_or_b32 s4, s4, s8 +; GCN-HSA-NEXT: s_or_b32 s8, s15, s14 +; GCN-HSA-NEXT: s_or_b32 s3, s3, s7 +; GCN-HSA-NEXT: s_or_b32 s7, s17, s16 +; GCN-HSA-NEXT: s_or_b32 s2, s2, s6 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s10 +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] +; 
GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v16i8_to_v16i16: @@ -10507,42 +10563,52 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v3 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v1 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s5, 24 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s4, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_u32 s10, s4, 0x80010 -; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s4, 0xff -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v4, 24, v2 -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v5, 8, v2 -; GCN-NOHSA-VI-NEXT: s_bfe_u32 s7, s5, 0x80010 -; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s5, 0xff +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v3 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v1 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s7, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_u32 s9, s7, 0x80010 +; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s7, 0xff +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s7, 8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s6, 24 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s5, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_u32 s15, s5, 0x80010 +; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s5, 0xff ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 8 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 16 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s9, s9, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff0000 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xff0000, v1 -; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v4, v4, v2, 16 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v7, 0xff0000, v5 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s4, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_u32 s18, s4, 0x80010 +; GCN-NOHSA-VI-NEXT: s_and_b32 
s19, s4, 0xff +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 8 +; GCN-NOHSA-VI-NEXT: s_bfe_u32 s12, s6, 0x80010 +; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s6, 0xff +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 8 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s8, s8, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xff0000 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s11, s11, 16 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s14, s14, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xff0000 -; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s7, s6 -; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s10, s9 -; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s11, s4 -; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v3, v3, v0, 16 -; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v5, 0xff00ff, v4 -; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v4, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s8, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s7 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xff00ff, v3 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s17, s17, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff0000 +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xff0000 +; GCN-NOHSA-VI-NEXT: s_or_b32 s8, s9, s8 +; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s10, s7 +; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s12, s11 +; GCN-NOHSA-VI-NEXT: s_or_b32 s10, s15, s14 +; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s16, s5 +; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s18, s17 +; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s19, s4 +; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s13, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s11 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s6 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 
v5, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v16i8_to_v16i16: @@ -10928,71 +10994,105 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff00, v2 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00, v3 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xff, v3 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff, v2 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v6, v6, v5 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v4, v7, v4 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v3 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v3, v5, v3, 16 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff00ff, v3 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 24, v2 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00ff, v2 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v2 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s5, v3 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s7, v1 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff00, v10 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 24, v10 +; 
GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s8, v6 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s9, v7 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s10, v4 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s11, v5 +; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s4, 0xff00 +; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s5, 0xff00 +; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s6, 0xff00 +; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s7, 0xff00 +; GCN-NOHSA-SI-NEXT: s_and_b32 s16, s8, 0xff00 +; GCN-NOHSA-SI-NEXT: s_and_b32 s17, s9, 0xff00 +; GCN-NOHSA-SI-NEXT: s_and_b32 s18, s10, 0xff00 +; GCN-NOHSA-SI-NEXT: s_and_b32 s19, s11, 0xff00 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s11, 8 +; GCN-NOHSA-SI-NEXT: s_bfe_u32 s21, s11, 0x80010 +; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s10, 8 +; GCN-NOHSA-SI-NEXT: s_bfe_u32 s23, s10, 0x80010 +; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s9, 8 +; GCN-NOHSA-SI-NEXT: s_bfe_u32 s25, s9, 0x80010 +; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, 0xff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s8, 8 +; GCN-NOHSA-SI-NEXT: s_bfe_u32 s27, s8, 0x80010 +; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s7, 8 +; GCN-NOHSA-SI-NEXT: s_bfe_u32 s29, s7, 0x80010 +; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s6, 8 +; GCN-NOHSA-SI-NEXT: s_bfe_u32 s31, s6, 0x80010 +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s5, 8 +; GCN-NOHSA-SI-NEXT: s_bfe_u32 s34, s5, 0x80010 +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s35, s4, 8 +; GCN-NOHSA-SI-NEXT: s_bfe_u32 s36, s4, 0x80010 +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xff +; GCN-NOHSA-SI-NEXT: s_and_b32 s20, s20, 0xff0000 +; GCN-NOHSA-SI-NEXT: s_lshl_b32 s19, s19, 8 +; GCN-NOHSA-SI-NEXT: s_and_b32 s22, s22, 0xff0000 +; GCN-NOHSA-SI-NEXT: s_lshl_b32 s18, s18, 8 +; GCN-NOHSA-SI-NEXT: s_and_b32 s24, s24, 0xff0000 +; GCN-NOHSA-SI-NEXT: s_lshl_b32 s17, s17, 8 +; 
GCN-NOHSA-SI-NEXT: s_and_b32 s26, s26, 0xff0000 +; GCN-NOHSA-SI-NEXT: s_lshl_b32 s16, s16, 8 +; GCN-NOHSA-SI-NEXT: s_and_b32 s28, s28, 0xff0000 +; GCN-NOHSA-SI-NEXT: s_lshl_b32 s15, s15, 8 +; GCN-NOHSA-SI-NEXT: s_and_b32 s30, s30, 0xff0000 +; GCN-NOHSA-SI-NEXT: s_lshl_b32 s14, s14, 8 +; GCN-NOHSA-SI-NEXT: s_and_b32 s33, s33, 0xff0000 +; GCN-NOHSA-SI-NEXT: s_lshl_b32 s13, s13, 8 +; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s35, 0xff0000 +; GCN-NOHSA-SI-NEXT: s_lshl_b32 s12, s12, 8 +; GCN-NOHSA-SI-NEXT: s_or_b32 s20, s21, s20 +; GCN-NOHSA-SI-NEXT: s_or_b32 s11, s11, s19 +; GCN-NOHSA-SI-NEXT: s_or_b32 s19, s23, s22 +; GCN-NOHSA-SI-NEXT: s_or_b32 s10, s10, s18 +; GCN-NOHSA-SI-NEXT: s_or_b32 s18, s25, s24 +; GCN-NOHSA-SI-NEXT: s_or_b32 s9, s9, s17 +; GCN-NOHSA-SI-NEXT: s_or_b32 s17, s27, s26 +; GCN-NOHSA-SI-NEXT: s_or_b32 s8, s8, s16 +; GCN-NOHSA-SI-NEXT: s_or_b32 s16, s29, s28 +; GCN-NOHSA-SI-NEXT: s_or_b32 s7, s7, s15 +; GCN-NOHSA-SI-NEXT: s_or_b32 s15, s31, s30 +; GCN-NOHSA-SI-NEXT: s_or_b32 s21, s34, s33 +; GCN-NOHSA-SI-NEXT: s_or_b32 s5, s5, s13 +; GCN-NOHSA-SI-NEXT: s_or_b32 s13, s36, s35 +; GCN-NOHSA-SI-NEXT: s_or_b32 s4, s4, s12 +; GCN-NOHSA-SI-NEXT: s_or_b32 s6, s6, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s21 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v4, 24, v11 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 24, v9 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v7, v6, v9, 16 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v5, v5, v8, 16 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v12, v4, v11, 16 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v13, v2, v10, 16 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff00, v11 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xff00, v8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xff00, v9 -; 
GCN-NOHSA-SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 24, v1 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v14, v14, v1, 16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 24, v0 -; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v15, v15, v0, 16 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xff00, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v17, 0xff00, v1 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xff, v0 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v6, 8, v2 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v2, v9, v0 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v8, v6 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v6, v11, v4 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v4, v10, v3 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v3, 8, v17 -; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v8, 8, v16 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v10, v1, v3 -; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v8, v18, v8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff00ff, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00ff, v5 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff00ff, v12 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00ff, v13 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xff00ff, v14 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v9, 0xff00ff, v15 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 
v0, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s18 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s20 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -11005,88 +11105,120 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 +; GCN-HSA-NEXT: s_waitcnt vmcnt(0) +; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v2 +; GCN-HSA-NEXT: v_readfirstlane_b32 s5, v3 +; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-HSA-NEXT: v_readfirstlane_b32 s7, v1 +; GCN-HSA-NEXT: s_and_b32 s8, s4, 0xff00 +; GCN-HSA-NEXT: s_and_b32 s9, s5, 0xff00 +; GCN-HSA-NEXT: s_and_b32 s10, s6, 0xff00 +; GCN-HSA-NEXT: s_and_b32 s11, s7, 0xff00 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: s_lshr_b32 s2, s7, 8 +; GCN-HSA-NEXT: s_bfe_u32 s3, s7, 0x80010 +; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xff +; GCN-HSA-NEXT: s_lshl_b32 s11, s11, 8 +; GCN-HSA-NEXT: s_lshr_b32 s12, s6, 8 +; GCN-HSA-NEXT: s_bfe_u32 s13, s6, 0x80010 +; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xff +; GCN-HSA-NEXT: s_lshl_b32 s10, s10, 8 +; GCN-HSA-NEXT: s_lshr_b32 s14, s5, 8 +; GCN-HSA-NEXT: s_bfe_u32 s15, s5, 0x80010 +; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xff +; GCN-HSA-NEXT: s_lshl_b32 s9, s9, 8 +; 
GCN-HSA-NEXT: s_lshr_b32 s16, s4, 8 +; GCN-HSA-NEXT: s_bfe_u32 s17, s4, 0x80010 +; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xff +; GCN-HSA-NEXT: s_lshl_b32 s8, s8, 8 +; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xff0000 +; GCN-HSA-NEXT: s_or_b32 s7, s7, s11 +; GCN-HSA-NEXT: s_and_b32 s11, s12, 0xff0000 +; GCN-HSA-NEXT: s_or_b32 s6, s6, s10 +; GCN-HSA-NEXT: s_and_b32 s10, s14, 0xff0000 +; GCN-HSA-NEXT: s_or_b32 s5, s5, s9 +; GCN-HSA-NEXT: s_and_b32 s9, s16, 0xff0000 +; GCN-HSA-NEXT: s_or_b32 s4, s4, s8 +; GCN-HSA-NEXT: s_or_b32 s2, s3, s2 +; GCN-HSA-NEXT: s_or_b32 s3, s13, s11 +; GCN-HSA-NEXT: s_or_b32 s8, s15, s10 +; GCN-HSA-NEXT: s_or_b32 s9, s17, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; GCN-HSA-NEXT: s_waitcnt vmcnt(1) +; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v2 +; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v3 +; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v0 +; GCN-HSA-NEXT: v_readfirstlane_b32 s5, v1 +; GCN-HSA-NEXT: s_and_b32 s6, s2, 0xff00 +; GCN-HSA-NEXT: s_and_b32 s7, s3, 0xff00 +; GCN-HSA-NEXT: s_and_b32 s8, s4, 0xff00 +; GCN-HSA-NEXT: s_and_b32 s9, s5, 0xff00 +; GCN-HSA-NEXT: s_lshr_b32 s10, s5, 8 +; GCN-HSA-NEXT: s_lshr_b32 s12, s4, 8 +; GCN-HSA-NEXT: s_lshr_b32 s14, s3, 8 +; GCN-HSA-NEXT: s_lshr_b32 s16, s2, 8 +; GCN-HSA-NEXT: s_bfe_u32 s11, s5, 0x80010 +; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xff +; GCN-HSA-NEXT: s_bfe_u32 s13, s4, 0x80010 +; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xff +; GCN-HSA-NEXT: s_bfe_u32 s15, s3, 0x80010 +; GCN-HSA-NEXT: s_and_b32 s3, s3, 0xff +; GCN-HSA-NEXT: s_bfe_u32 s17, s2, 0x80010 +; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xff +; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xff0000 +; GCN-HSA-NEXT: s_lshl_b32 s9, s9, 8 +; GCN-HSA-NEXT: s_and_b32 s12, s12, 
0xff0000 +; GCN-HSA-NEXT: s_lshl_b32 s8, s8, 8 +; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xff0000 +; GCN-HSA-NEXT: s_lshl_b32 s7, s7, 8 +; GCN-HSA-NEXT: s_and_b32 s16, s16, 0xff0000 +; GCN-HSA-NEXT: s_lshl_b32 s6, s6, 8 +; GCN-HSA-NEXT: s_or_b32 s10, s11, s10 +; GCN-HSA-NEXT: s_or_b32 s5, s5, s9 +; GCN-HSA-NEXT: s_or_b32 s9, s13, s12 +; GCN-HSA-NEXT: s_or_b32 s4, s4, s8 +; GCN-HSA-NEXT: s_or_b32 s8, s15, s14 +; GCN-HSA-NEXT: s_or_b32 s3, s3, s7 +; GCN-HSA-NEXT: s_or_b32 s7, s17, s16 +; GCN-HSA-NEXT: s_or_b32 s6, s2, s6 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 -; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 24, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff00, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v7 -; GCN-HSA-NEXT: v_alignbit_b32 v7, v9, v7, 16 -; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xff00, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xff00ff, v7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v6 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; GCN-HSA-NEXT: v_alignbit_b32 v7, v7, v6, 16 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GCN-HSA-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v7 -; GCN-HSA-NEXT: v_or_b32_e32 v6, v6, v16 -; GCN-HSA-NEXT: 
flat_store_dwordx4 v[12:13], v[6:9] -; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff00, v4 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff00, v5 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 24, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xff, v4 -; GCN-HSA-NEXT: v_alignbit_b32 v5, v9, v5, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; GCN-HSA-NEXT: v_alignbit_b32 v9, v7, v4, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v6 -; GCN-HSA-NEXT: v_or_b32_e32 v6, v12, v8 -; GCN-HSA-NEXT: v_or_b32_e32 v4, v13, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v9 -; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xff00, v2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 24, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff00, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xff00, v0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 24, v0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xff00, v1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff, v2 -; GCN-HSA-NEXT: v_alignbit_b32 v1, v5, v1, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GCN-HSA-NEXT: v_alignbit_b32 v0, v9, v0, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v5, 8, v13 -; GCN-HSA-NEXT: v_alignbit_b32 v9, v12, v3, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; GCN-HSA-NEXT: v_alignbit_b32 v12, v19, v2, 16 -; GCN-HSA-NEXT: v_lshlrev_b32_e32 v13, 8, v18 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 -; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v1 -; GCN-HSA-NEXT: v_or_b32_e32 v2, v6, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 -; GCN-HSA-NEXT: v_or_b32_e32 v0, v7, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v9 -; GCN-HSA-NEXT: v_or_b32_e32 v6, v10, v8 -; 
GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v12 -; GCN-HSA-NEXT: v_or_b32_e32 v4, v11, v13 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v32i8_to_v32i16: @@ -11104,79 +11236,100 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v3 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v3 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v7 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v5 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v1 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s7, 24 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s6, 24 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s4, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_u32 s18, s4, 0x80010 -; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s4, 0xff +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s9, v7 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s11, v5 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v1 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v6 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s10, v4 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s11, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_u32 s13, s11, 0x80010 +; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s11, 0xff +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s11, s11, 8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s9, 24 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s5, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_u32 s31, 
s5, 0x80010 +; GCN-NOHSA-VI-NEXT: s_and_b32 s33, s5, 0xff +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s4, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_u32 s35, s4, 0x80010 +; GCN-NOHSA-VI-NEXT: s_and_b32 s36, s4, 0xff ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 24, v2 -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v5, 8, v4 -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v11, 8, v2 -; GCN-NOHSA-VI-NEXT: s_bfe_u32 s9, s7, 0x80010 -; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s7, 0xff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s10, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_u32 s16, s10, 0x80010 +; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s10, 0xff +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s10, s10, 8 +; GCN-NOHSA-VI-NEXT: s_bfe_u32 s19, s9, 0x80010 +; GCN-NOHSA-VI-NEXT: s_and_b32 s20, s9, 0xff +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s9, s9, 8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s21, s8, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_u32 s22, s8, 0x80010 +; GCN-NOHSA-VI-NEXT: s_and_b32 s23, s8, 0xff +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s8, s8, 8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s7, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_u32 s25, s7, 0x80010 +; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s7, 0xff ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s7, 8 -; GCN-NOHSA-VI-NEXT: s_bfe_u32 s12, s6, 0x80010 -; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s6, 0xff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s27, s6, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_u32 s28, s6, 0x80010 +; GCN-NOHSA-VI-NEXT: s_and_b32 s29, s6, 0xff ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s5, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_u32 s15, s5, 0x80010 -; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s5, 0xff -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 8 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s8, s8, 16 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s11, s11, 16 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s17, s17, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xff0000 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s18, s18, 16 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s30, s30, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 
0xff0000 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s34, s34, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff0000 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 24, v6 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v9, 8, v6 -; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v10, 8, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xff0000, v5 -; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v13, v8, v2, 16 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v11, 0xff0000, v11 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s12, s12, 16 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s15, s15, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xff0000 +; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xff0000 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s21, s21, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xff0000 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s24, s24, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xff0000 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s27, s27, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xff0000 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s14, s14, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xff0000 -; GCN-NOHSA-VI-NEXT: s_or_b32 s8, s9, s8 -; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s12, s11 -; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s18, s17 -; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s19, s4 -; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v7, v7, v4, 16 -; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v1, v1, v6, 16 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v9, 0xff0000, v9 -; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v3, v3, v0, 16 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xff0000, v10 -; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v12, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v13, 0xff00ff, v13 -; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s10, s7 -; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s13, s6 -; GCN-NOHSA-VI-NEXT: s_or_b32 s10, s15, s14 -; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s16, s5 -; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s11 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v5, 0xff00ff, v7 -; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v8, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v9, 0xff00ff, v1 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xff00ff, v3 -; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s14, s11 +; GCN-NOHSA-VI-NEXT: s_or_b32 s14, s19, s18 +; GCN-NOHSA-VI-NEXT: s_or_b32 s18, s31, s30 +; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s33, s5 +; GCN-NOHSA-VI-NEXT: s_or_b32 s19, s35, s34 +; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s36, s4 +; GCN-NOHSA-VI-NEXT: s_or_b32 s12, s13, s12 +; GCN-NOHSA-VI-NEXT: s_or_b32 s13, s16, s15 +; GCN-NOHSA-VI-NEXT: s_or_b32 s10, s17, s10 +; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s20, s9 +; GCN-NOHSA-VI-NEXT: s_or_b32 s15, s22, s21 +; GCN-NOHSA-VI-NEXT: s_or_b32 s8, s23, s8 +; GCN-NOHSA-VI-NEXT: s_or_b32 s16, s25, s24 +; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s26, s7 +; GCN-NOHSA-VI-NEXT: s_or_b32 s17, s28, s27 +; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s29, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s19 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s17 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s8 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s16 
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s14 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s12 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v32i8_to_v32i16: diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index a42c71c4849bd..715a22dbf6653 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -7054,30 +7054,30 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v3 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v1 ; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_or_b32_e32 
v7, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -7105,16 +7105,17 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_add_f32_e32 v5, v5, v4 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX7-NEXT: v_add_f32_e32 v6, v1, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 @@ -7144,16 +7145,17 @@ define <2 x 
bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_add_f32_e32 v5, v5, v4 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX6-NEXT: v_add_f32_e32 v6, v1, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 @@ -7411,30 +7413,30 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v3 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v1 ; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, 
vcc, v6, v2 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -7462,16 +7464,17 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_add_f32_e32 v5, v5, v4 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX7-NEXT: v_add_f32_e32 v6, v1, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 offset:65532 ; 
GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 @@ -7502,16 +7505,17 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_add_f32_e32 v5, v5, v3 -; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v5, 16 +; GFX6-NEXT: v_add_f32_e32 v6, v1, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v4, v0, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v0 @@ -7761,29 +7765,29 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v3, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3 ; GFX8-NEXT: v_add_f32_e32 v4, v4, v2 ; GFX8-NEXT: v_add_f32_e32 v5, v5, v1 ; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 ; 
GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -7810,16 +7814,17 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GFX7-NEXT: v_add_f32_e32 v6, v3, v2 +; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: 
ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -7847,16 +7852,17 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GFX6-NEXT: v_add_f32_e32 v6, v3, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -8105,29 +8111,29 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3 ; GFX8-NEXT: v_add_f32_e32 v4, v4, v2 ; GFX8-NEXT: v_add_f32_e32 v5, v5, 
v1 ; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -8154,16 +8160,17 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GFX7-NEXT: v_add_f32_e32 v6, v3, v2 +; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, 
v6 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -8192,16 +8199,17 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GFX6-NEXT: v_add_f32_e32 v6, v3, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll index 8351d28057564..b3ed754112daa 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll @@ -6987,30 +6987,30 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 ; GFX8-NEXT: v_max_f32_e32 v2, v2, v3 ; GFX8-NEXT: v_max_f32_e32 v5, v5, v1 ; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -7028,26 +7028,27 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; 
GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v4 +; GFX7-NEXT: v_max_f32_e32 v6, v1, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 @@ -7067,26 +7068,27 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v3, v0 ; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX6-NEXT: 
v_max_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v4 +; GFX6-NEXT: v_max_f32_e32 v6, v1, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v6, 16 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 @@ -7469,30 +7471,30 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 ; GFX8-NEXT: v_max_f32_e32 v2, v2, v3 ; GFX8-NEXT: v_max_f32_e32 v5, v5, v1 ; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: 
v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -7510,26 +7512,27 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v4 +; GFX7-NEXT: v_max_f32_e32 v6, v1, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; 
GFX7-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 @@ -7550,26 +7553,27 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v0, v4 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_max_f32_e32 v6, v1, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v6, 16 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v4, v0, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v0 @@ -7938,29 +7942,29 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v3, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; 
GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3 ; GFX8-NEXT: v_max_f32_e32 v4, v4, v2 ; GFX8-NEXT: v_max_f32_e32 v5, v5, v1 ; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -7976,33 +7980,34 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: ds_read_b32 v4, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 
s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_max_f32_e32 v6, v3, v2 +; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8013,33 +8018,34 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v3, v0 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; 
GFX6-NEXT: ds_read_b32 v4, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_max_f32_e32 v6, v3, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB26_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8401,29 +8407,29 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: 
ds_read_b32 v3, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3 ; GFX8-NEXT: v_max_f32_e32 v4, v4, v2 ; GFX8-NEXT: v_max_f32_e32 v5, v5, v1 ; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -8439,33 +8445,34 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: ds_read_b32 v4, v0 offset:65532 
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_max_f32_e32 v6, v3, v2 +; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4 offset:65532 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8477,33 +8484,34 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v3, v0 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: ds_read_b32 v4, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_max_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX6-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_max_f32_e32 v6, v3, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB27_1 ; GFX6-NEXT: ; %bb.2: ; 
%atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll index 0c4aca88b3781..a48b8f5c66b2e 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll @@ -6987,30 +6987,30 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 ; GFX8-NEXT: v_min_f32_e32 v2, v2, v3 ; GFX8-NEXT: v_min_f32_e32 v5, v5, v1 ; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: 
ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -7028,26 +7028,27 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v4 +; GFX7-NEXT: v_min_f32_e32 v6, v1, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 @@ -7067,26 +7068,27 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v3, v0 ; GFX6-NEXT: v_mov_b32_e32 v4, v1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: 
v_mul_f32_e32 v4, 1.0, v4 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v4 +; GFX6-NEXT: v_min_f32_e32 v6, v1, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v6, 16 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 @@ -7469,30 +7471,30 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_and_b32_e32 
v5, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 ; GFX8-NEXT: v_min_f32_e32 v2, v2, v3 ; GFX8-NEXT: v_min_f32_e32 v5, v5, v1 ; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -7510,26 +7512,27 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX7-NEXT: v_mov_b32_e32 v4, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: 
v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v4 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v4 +; GFX7-NEXT: v_min_f32_e32 v6, v1, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 @@ -7550,26 +7553,27 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v0, v4 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v3 -; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_and_b32_e32 v5, 
0xffff0000, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX6-NEXT: v_min_f32_e32 v6, v1, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v6, 16 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v4, v0, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v0 @@ -7938,29 +7942,29 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v3, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3 ; GFX8-NEXT: v_min_f32_e32 v4, v4, v2 ; GFX8-NEXT: v_min_f32_e32 v5, v5, v1 ; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GFX8-NEXT: 
v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -7976,33 +7980,34 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: ds_read_b32 v4, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_min_f32_e32 v6, v3, v2 +; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX7-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB26_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8013,33 +8018,34 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v3, v0 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: ds_read_b32 v4, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_min_f32_e32 v6, v3, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4 +; GFX6-NEXT: 
v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB26_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8401,29 +8407,29 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3 ; GFX8-NEXT: v_min_f32_e32 v4, v4, v2 ; GFX8-NEXT: v_min_f32_e32 v5, v5, v1 ; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX8-NEXT: 
v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -8439,33 +8445,34 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: ds_read_b32 v3, v0 offset:65532 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: ds_read_b32 v4, v0 offset:65532 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v1 -; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_min_f32_e32 v6, v3, v2 +; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4 offset:65532 +; GFX7-NEXT: 
v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB27_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8477,33 +8484,34 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffc, v0 ; GFX6-NEXT: s_mov_b32 m0, -1 -; GFX6-NEXT: ds_read_b32 v3, v0 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: ds_read_b32 v4, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 -; GFX6-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GFX6-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX6-NEXT: v_min_f32_e32 v6, v3, v2 +; 
GFX6-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v0, v3, v4 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB27_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll index 37310b614c0db..8c1303d98f802 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll @@ -7759,30 +7759,30 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 ; GFX8-NEXT: v_sub_f32_e32 v2, v2, v3 ; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1 ; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: 
v_or_b32_e32 v7, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -7810,16 +7810,17 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_sub_f32_e32 v5, v5, v4 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX7-NEXT: v_sub_f32_e32 v6, v1, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 @@ -7849,16 +7850,17 @@ 
define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_sub_f32_e32 v5, v5, v4 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v3, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX6-NEXT: v_sub_f32_e32 v6, v1, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 @@ -8241,30 +8243,30 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v2, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 ; GFX8-NEXT: v_sub_f32_e32 v2, v2, v3 ; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1 ; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX8-NEXT: 
v_add_u32_e32 v6, vcc, v6, v2 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v2, v5, v2, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 @@ -8292,16 +8294,17 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_sub_f32_e32 v5, v5, v4 -; GFX7-NEXT: v_alignbit_b32 v1, v1, v3, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX7-NEXT: v_sub_f32_e32 v6, v1, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 
offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 @@ -8332,16 +8335,17 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_sub_f32_e32 v5, v5, v3 -; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v1, v1, v5, 16 +; GFX6-NEXT: v_sub_f32_e32 v6, v1, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v5, v4, v0, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v0 @@ -8710,29 +8714,29 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v3, v0 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3 ; GFX8-NEXT: v_sub_f32_e32 v4, v4, v2 ; GFX8-NEXT: v_sub_f32_e32 v5, v5, v1 ; GFX8-NEXT: v_bfe_u32 v6, 
v4, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -8759,16 +8763,17 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GFX7-NEXT: v_sub_f32_e32 v6, v3, v2 +; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; 
GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -8796,16 +8801,17 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GFX6-NEXT: v_sub_f32_e32 v6, v3, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -9173,29 +9179,29 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_read_b32 v3, v0 offset:65532 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3 ; GFX8-NEXT: v_sub_f32_e32 v4, v4, v2 ; GFX8-NEXT: 
v_sub_f32_e32 v5, v5, v1 ; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -9222,16 +9228,17 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GFX7-NEXT: v_sub_f32_e32 v6, v3, v2 +; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: 
v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 offset:65532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -9260,16 +9267,17 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_sub_f32_e32 v6, v6, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_sub_f32_e32 v5, v5, v1 -; GFX6-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GFX6-NEXT: v_sub_f32_e32 v6, v3, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX6-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 diff --git a/llvm/test/CodeGen/AMDGPU/packetizer.ll b/llvm/test/CodeGen/AMDGPU/packetizer.ll index aab035f811434..645641d009a45 100644 --- a/llvm/test/CodeGen/AMDGPU/packetizer.ll +++ b/llvm/test/CodeGen/AMDGPU/packetizer.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s ; RUN: llc < %s -mtriple=r600 -mcpu=cayman | FileCheck %s @@ -32,3 +33,5 @@ entry: store i32 %xyzw, ptr addrspace(1) %out ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/permute.ll b/llvm/test/CodeGen/AMDGPU/permute.ll index cac983a3acfb3..ee43d46a61917 100644 --- a/llvm/test/CodeGen/AMDGPU/permute.ll +++ b/llvm/test/CodeGen/AMDGPU/permute.ll @@ -118,14 +118,14 @@ define amdgpu_kernel void @lsh8_or_lsr24(ptr addrspace(1) nocapture %arg, i32 %a ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: v_mov_b32_e32 v3, 0x2010007 +; GCN-NEXT: v_mov_b32_e32 v3, 0x6050403 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_perm_b32 v2, s2, v2, v3 +; GCN-NEXT: v_perm_b32 v2, v2, s2, v3 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll index a4ddfee115fa6..4d98c8bb54902 100644 --- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll +++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll @@ -8,7 +8,10 @@ define hidden void @shuffle6766(ptr addrspace(1) %in0, ptr addrspace(1) %in1, pt ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x6060706 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_lshlrev_b16 v1, 8, v0 +; GFX10-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: global_store_dword v[4:5], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -16,9 +19,13 @@ define hidden void @shuffle6766(ptr addrspace(1) %in0, ptr addrspace(1) %in1, pt ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[2:3], off -; GFX9-NEXT: 
s_mov_b32 s4, 0x6060706 +; GFX9-NEXT: v_mov_b32_e32 v1, 8 +; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -327,7 +334,9 @@ define hidden void @shuffle7330ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x4070706 +; GFX10-NEXT: v_lshlrev_b16 v1, 8, v0 +; GFX10-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -335,9 +344,10 @@ define hidden void @shuffle7330ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0x4070706 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; 
GFX9-NEXT: s_setpc_b64 s[30:31] @@ -405,7 +415,8 @@ define hidden void @shuffle4327ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060706 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v1 ; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -413,9 +424,10 @@ define hidden void @shuffle4327ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0x7060706 +; GFX9-NEXT: s_mov_b32 s4, 0xffff0000 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -457,7 +469,8 @@ define hidden void @shuffle2763ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060706 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v1 ; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -465,9 +478,10 @@ define hidden void @shuffle2763ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0x7060706 +; GFX9-NEXT: s_mov_b32 s4, 0xffff0000 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX9-NEXT: 
global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1242,10 +1256,10 @@ define hidden void @ive_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: v_and_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_perm_b32 v1, v10, v9, 0x2000706 -; GFX10-NEXT: global_store_dword v[5:6], v0, off -; GFX10-NEXT: global_store_dword v[7:8], v1, off +; GFX10-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: global_store_dword v[5:6], v1, off +; GFX10-NEXT: global_store_dword v[7:8], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: ive_store_div: @@ -1261,7 +1275,6 @@ define hidden void @ive_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: global_load_dword v10, v[2:3], off ; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4 -; GFX9-NEXT: s_mov_b32 s5, 0x2000706 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v9 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1270,9 +1283,9 @@ define hidden void @ive_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v3, v10, v9, s5 +; GFX9-NEXT: v_or_b32_sdwa v2, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: global_store_dword 
v[5:6], v0, off -; GFX9-NEXT: global_store_dword v[7:8], v3, off +; GFX9-NEXT: global_store_dword v[7:8], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1507,64 +1520,68 @@ define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: global_load_dword v4, v[2:3], off ; GFX10-NEXT: global_load_dword v9, v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v0, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 -; GFX10-NEXT: v_cvt_f32_i32_sdwa v10, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 -; GFX10-NEXT: v_cvt_f32_i32_sdwa v12, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 -; GFX10-NEXT: v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v15, v1 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v16, v10 -; GFX10-NEXT: v_cvt_f32_i32_sdwa v19, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v12 -; GFX10-NEXT: v_xor_b32_sdwa v0, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v14 -; GFX10-NEXT: v_xor_b32_sdwa v3, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 -; GFX10-NEXT: v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 -; GFX10-NEXT: v_xor_b32_sdwa v13, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3 -; GFX10-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; GFX10-NEXT: v_mul_f32_e32 v15, v2, v15 -; GFX10-NEXT: v_mul_f32_e32 v16, v19, v16 -; GFX10-NEXT: 
v_ashrrev_i32_e32 v3, 30, v3 -; GFX10-NEXT: v_mul_f32_e32 v17, v2, v17 -; GFX10-NEXT: v_or_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_trunc_f32_e32 v15, v15 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v11, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_i32_sdwa v3, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v13, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v15, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v16, v2 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v11 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v20, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v13 +; GFX10-NEXT: v_xor_b32_sdwa v1, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v19, v15 +; GFX10-NEXT: v_xor_b32_sdwa v10, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 +; GFX10-NEXT: v_xor_b32_sdwa v12, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 +; GFX10-NEXT: v_xor_b32_sdwa v14, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3 +; GFX10-NEXT: v_ashrrev_i32_e32 v1, 30, v1 +; GFX10-NEXT: v_mul_f32_e32 v16, v3, v16 +; GFX10-NEXT: v_mul_f32_e32 v17, v20, v17 +; GFX10-NEXT: v_ashrrev_i32_e32 v10, 30, v10 +; GFX10-NEXT: v_mul_f32_e32 v18, v3, v18 +; GFX10-NEXT: v_or_b32_e32 v1, 1, v1 ; GFX10-NEXT: v_trunc_f32_e32 v16, v16 -; GFX10-NEXT: v_mul_f32_e32 v18, v1, v18 ; GFX10-NEXT: v_trunc_f32_e32 v17, v17 -; GFX10-NEXT: v_ashrrev_i32_e32 v11, 30, v11 -; GFX10-NEXT: v_mad_f32 v20, -v15, v1, v2 -; GFX10-NEXT: v_mad_f32 v19, -v16, v10, v19 -; GFX10-NEXT: v_or_b32_e32 v3, 1, v3 +; 
GFX10-NEXT: v_mul_f32_e32 v19, v2, v19 ; GFX10-NEXT: v_trunc_f32_e32 v18, v18 -; GFX10-NEXT: v_mad_f32 v2, -v17, v12, v2 -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v20|, |v1| -; GFX10-NEXT: v_ashrrev_i32_e32 v13, 30, v13 -; GFX10-NEXT: v_or_b32_e32 v11, 1, v11 -; GFX10-NEXT: v_mad_f32 v21, -v18, v14, v1 -; GFX10-NEXT: v_cvt_i32_f32_e32 v15, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v19|, |v10| -; GFX10-NEXT: v_or_b32_e32 v13, 1, v13 +; GFX10-NEXT: v_ashrrev_i32_e32 v12, 30, v12 +; GFX10-NEXT: v_mad_f32 v21, -v16, v2, v3 +; GFX10-NEXT: v_mad_f32 v20, -v17, v11, v20 +; GFX10-NEXT: v_or_b32_e32 v10, 1, v10 +; GFX10-NEXT: v_trunc_f32_e32 v19, v19 +; GFX10-NEXT: v_mad_f32 v3, -v18, v13, v3 +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v21|, |v2| +; GFX10-NEXT: v_ashrrev_i32_e32 v14, 30, v14 +; GFX10-NEXT: v_or_b32_e32 v12, 1, v12 +; GFX10-NEXT: v_mad_f32 v22, -v19, v15, v2 ; GFX10-NEXT: v_cvt_i32_f32_e32 v16, v16 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v20|, |v11| +; GFX10-NEXT: v_or_b32_e32 v14, 1, v14 ; GFX10-NEXT: v_cvt_i32_f32_e32 v17, v17 ; GFX10-NEXT: v_cvt_i32_f32_e32 v18, v18 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, |v12| -; GFX10-NEXT: v_add_nc_u32_e32 v0, v15, v0 -; GFX10-NEXT: v_add_nc_u32_sdwa v1, v16, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v11, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v21|, |v14| -; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_add_nc_u32_e32 v2, v17, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v3, 0, v13, vcc_lo -; GFX10-NEXT: v_add_nc_u32_sdwa v3, v18, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: 
v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x60706 -; GFX10-NEXT: global_store_dword v[5:6], v0, off -; GFX10-NEXT: global_store_dword v[7:8], v1, off +; GFX10-NEXT: v_cvt_i32_f32_e32 v19, v19 +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v10, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v3|, |v13| +; GFX10-NEXT: v_add_nc_u32_e32 v1, v16, v1 +; GFX10-NEXT: v_and_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16 v4, 8, v4 +; GFX10-NEXT: v_add_nc_u32_sdwa v2, v17, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e32 v3, 0, v12, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v22|, |v15| +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_add_nc_u32_e32 v3, v18, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v10, 0, v14, vcc_lo +; GFX10-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_add_nc_u32_sdwa v10, v19, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: global_store_dword v[5:6], v1, off +; GFX10-NEXT: global_store_dword v[7:8], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: sdiv_store_div: @@ -1576,64 +1593,67 @@ define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v4, 
v[2:3], off -; GFX9-NEXT: global_load_dword v9, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0x60706 +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v9, v[2:3], off +; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v12, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 +; GFX9-NEXT: v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v9, v4, s4 -; GFX9-NEXT: v_xor_b32_sdwa v1, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 -; GFX9-NEXT: v_xor_b32_sdwa v10, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v11, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX9-NEXT: v_xor_b32_sdwa v9, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v13, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 -; GFX9-NEXT: v_xor_b32_sdwa v14, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v4, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v15, v2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v16, v12 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v17, v13 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v18, v4 -; GFX9-NEXT: v_mul_f32_e32 v15, v3, v15 -; GFX9-NEXT: v_mul_f32_e32 v16, v11, v16 -; GFX9-NEXT: v_trunc_f32_e32 v15, v15 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 30, v1 -; GFX9-NEXT: v_mul_f32_e32 v17, v3, v17 -; GFX9-NEXT: v_mul_f32_e32 v18, v2, v18 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v9) dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v13, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v9 +; GFX9-NEXT: v_xor_b32_sdwa v2, sext(v4), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 +; GFX9-NEXT: v_xor_b32_sdwa v11, sext(v4), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 +; GFX9-NEXT: v_xor_b32_sdwa v14, sext(v4), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v15, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 +; GFX9-NEXT: v_xor_b32_sdwa v16, sext(v9), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v9, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 30, v2 +; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v11 +; GFX9-NEXT: v_ashrrev_i32_e32 v11, 30, v14 +; GFX9-NEXT: v_ashrrev_i32_e32 v14, 30, v16 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v10, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v12, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX9-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v4, 1, v11 +; GFX9-NEXT: v_or_b32_e32 v11, 1, v14 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v14, v3 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v16, v13 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v17, v15 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v18, v9 +; GFX9-NEXT: v_mul_f32_e32 v14, v10, v14 +; GFX9-NEXT: v_mul_f32_e32 v16, v12, v16 +; GFX9-NEXT: v_trunc_f32_e32 v14, v14 +; GFX9-NEXT: v_mul_f32_e32 v17, v10, v17 +; GFX9-NEXT: v_mul_f32_e32 v18, v3, v18 ; GFX9-NEXT: v_trunc_f32_e32 v16, v16 -; GFX9-NEXT: v_mad_f32 v19, -v15, v2, v3 -; GFX9-NEXT: 
v_ashrrev_i32_e32 v10, 30, v10 +; GFX9-NEXT: v_mad_f32 v19, -v14, v3, v10 ; GFX9-NEXT: v_or_b32_e32 v1, 1, v1 ; GFX9-NEXT: v_trunc_f32_e32 v17, v17 ; GFX9-NEXT: v_trunc_f32_e32 v18, v18 -; GFX9-NEXT: v_mad_f32 v11, -v16, v12, v11 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, |v2| -; GFX9-NEXT: v_ashrrev_i32_e32 v9, 30, v9 -; GFX9-NEXT: v_or_b32_e32 v10, 1, v10 -; GFX9-NEXT: v_cvt_i32_f32_e32 v15, v15 +; GFX9-NEXT: v_mad_f32 v12, -v16, v13, v12 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, |v3| +; GFX9-NEXT: v_or_b32_e32 v2, 1, v2 +; GFX9-NEXT: v_cvt_i32_f32_e32 v14, v14 ; GFX9-NEXT: v_cvt_i32_f32_e32 v16, v16 -; GFX9-NEXT: v_mad_f32 v3, -v17, v13, v3 +; GFX9-NEXT: v_mad_f32 v10, -v17, v15, v10 ; GFX9-NEXT: v_cvt_i32_f32_e32 v17, v17 -; GFX9-NEXT: v_mad_f32 v2, -v18, v4, v2 +; GFX9-NEXT: v_mad_f32 v3, -v18, v9, v3 ; GFX9-NEXT: v_cvt_i32_f32_e32 v18, v18 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v11|, |v12| -; GFX9-NEXT: v_ashrrev_i32_e32 v14, 30, v14 -; GFX9-NEXT: v_or_b32_e32 v9, 1, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v13| -; GFX9-NEXT: v_or_b32_e32 v14, 1, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v4| -; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v14, vcc -; GFX9-NEXT: v_add_u32_e32 v1, v15, v1 -; GFX9-NEXT: v_add_u32_sdwa v4, v16, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v3, v17, v3 -; GFX9-NEXT: v_add_u32_sdwa v2, v18, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v12|, |v13| +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v10|, |v15| +; GFX9-NEXT: 
v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v9| +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v11, vcc +; GFX9-NEXT: v_add_u32_e32 v1, v14, v1 +; GFX9-NEXT: v_add_u32_sdwa v2, v16, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v4, v17, v4 +; GFX9-NEXT: v_add_u32_sdwa v3, v18, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: global_store_dword v[5:6], v1, off ; GFX9-NEXT: global_store_dword v[7:8], v0, off diff --git a/llvm/test/CodeGen/AMDGPU/rotate-add.ll b/llvm/test/CodeGen/AMDGPU/rotate-add.ll index 53a49c9a21e2c..25346d8923a83 100644 --- a/llvm/test/CodeGen/AMDGPU/rotate-add.ll +++ b/llvm/test/CodeGen/AMDGPU/rotate-add.ll @@ -44,15 +44,19 @@ define i32 @test_rotl_var(i32 %x, i32 %y) { ; SI-LABEL: test_rotl_var: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, v1, v0 ; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: test_rotl_var: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, v1, v0 ; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v1 -; VI-NEXT: v_alignbit_b32 v0, v0, v0, v1 +; VI-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 ; VI-NEXT: s_setpc_b64 s[30:31] %shl = shl i32 %x, %y %sub = sub i32 32, %y @@ -65,13 +69,19 @@ define i32 @test_rotr_var(i32 %x, i32 %y) { ; SI-LABEL: test_rotr_var: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
SI-NEXT: v_alignbit_b32 v0, v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, v1, v0 +; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: test_rotr_var: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_alignbit_b32 v0, v0, v0, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, v1, v0 +; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v1 +; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 ; VI-NEXT: s_setpc_b64 s[30:31] %shr = lshr i32 %x, %y %sub = sub i32 32, %y @@ -164,13 +174,21 @@ define i32 @test_fshr_special_case(i32 %x0, i32 %x1, i32 %y) { ; SI-LABEL: test_fshr_special_case: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_alignbit_b32 v0, v0, v1, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; SI-NEXT: v_xor_b32_e32 v2, 31, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, v2, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: test_fshr_special_case: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_alignbit_b32 v0, v0, v1, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, v2, v1 +; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-NEXT: v_xor_b32_e32 v2, 31, v2 +; VI-NEXT: v_lshlrev_b32_e32 v0, v2, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; VI-NEXT: s_setpc_b64 s[30:31] %shl = lshr i32 %x1, %y %srli = shl i32 %x0, 1 @@ -259,11 +277,13 @@ define i64 @test_rotl_mul_with_mask_special_case(i64 %i) { ; SI-LABEL: test_rotl_mul_with_mask_special_case: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 7, v0 ; SI-NEXT: v_mul_lo_u32 v1, v1, 9 -; SI-NEXT: v_mul_hi_u32 v2, v0, 9 -; SI-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 25 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_mul_hi_u32 v0, v0, 9 +; SI-NEXT: v_add_i32_e32 v0, 
vcc, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0x80, v2 +; SI-NEXT: v_lshrrev_b32_e32 v0, 25, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -272,9 +292,11 @@ define i64 @test_rotl_mul_with_mask_special_case(i64 %i) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mul_lo_u32 v1, v1, 9 ; VI-NEXT: v_mul_hi_u32 v2, v0, 9 +; VI-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; VI-NEXT: v_and_b32_e32 v0, 0x80, v0 ; VI-NEXT: v_add_u32_e32 v1, vcc, v2, v1 -; VI-NEXT: v_alignbit_b32 v0, v0, v1, 25 -; VI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; VI-NEXT: v_lshrrev_b32_e32 v1, 25, v1 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_setpc_b64 s[30:31] %lhs_mul = mul i64 %i, 1152 @@ -289,16 +311,16 @@ define i32 @test_fshl_with_mask_special_case(i32 %x) { ; SI-LABEL: test_fshl_with_mask_special_case: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, 1, v0 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 27 +; SI-NEXT: v_alignbit_b32 v0, v0, v0, 27 +; SI-NEXT: v_or_b32_e32 v0, 32, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffffffe1, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: test_fshl_with_mask_special_case: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_or_b32_e32 v1, 1, v0 -; VI-NEXT: v_alignbit_b32 v0, v1, v0, 27 +; VI-NEXT: v_alignbit_b32 v0, v0, v0, 27 +; VI-NEXT: v_or_b32_e32 v0, 32, v0 ; VI-NEXT: v_and_b32_e32 v0, 0xffffffe1, v0 ; VI-NEXT: s_setpc_b64 s[30:31] %or1 = or i32 %x, 1 diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll index 0a746b0a3f572..008d8cef23797 100644 --- a/llvm/test/CodeGen/AMDGPU/rotl.ll +++ b/llvm/test/CodeGen/AMDGPU/rotl.ll @@ -25,12 +25,14 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_lshl_b32 s4, s2, s3 ; SI-NEXT: 
s_sub_i32 s3, 32, s3 +; SI-NEXT: s_lshr_b32 s2, s2, s3 +; SI-NEXT: s_or_b32 s2, s4, s2 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_alignbit_b32 v0, s2, s2, v0 +; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -38,11 +40,13 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b32 s4, s2, s3 ; GFX8-NEXT: s_sub_i32 s3, 32, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_alignbit_b32 v2, s2, s2, v0 +; GFX8-NEXT: s_lshr_b32 s2, s2, s3 +; GFX8-NEXT: s_or_b32 s2, s4, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -51,19 +55,24 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sub_i32 s3, 32, s3 -; GFX10-NEXT: v_alignbit_b32 v1, s2, s2, s3 +; GFX10-NEXT: s_sub_i32 s4, 32, s3 +; GFX10-NEXT: s_lshl_b32 s3, s2, s3 +; GFX10-NEXT: s_lshr_b32 s2, s2, s4 +; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotl_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sub_i32 s3, 32, s3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_alignbit_b32 v1, s2, s2, s3 +; GFX11-NEXT: s_sub_i32 s4, 32, s3 +; GFX11-NEXT: s_lshl_b32 s3, s2, s3 +; GFX11-NEXT: s_lshr_b32 s2, s2, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; 
GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm entry: @@ -97,14 +106,18 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_lshl_b32 s6, s0, s2 +; SI-NEXT: s_lshl_b32 s8, s1, s3 ; SI-NEXT: s_sub_i32 s3, 32, s3 ; SI-NEXT: s_sub_i32 s2, 32, s2 -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_alignbit_b32 v1, s1, s1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_alignbit_b32 v0, s0, s0, v0 +; SI-NEXT: s_lshr_b32 s0, s0, s2 +; SI-NEXT: s_lshr_b32 s1, s1, s3 +; SI-NEXT: s_or_b32 s1, s8, s1 +; SI-NEXT: s_or_b32 s0, s6, s0 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -113,13 +126,17 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sub_i32 s2, 32, s2 +; GFX8-NEXT: s_lshl_b32 s6, s0, s2 +; GFX8-NEXT: s_lshl_b32 s7, s1, s3 ; GFX8-NEXT: s_sub_i32 s3, 32, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_alignbit_b32 v1, s1, s1, v0 -; GFX8-NEXT: v_alignbit_b32 v0, s0, s0, v2 +; GFX8-NEXT: s_sub_i32 s2, 32, s2 +; GFX8-NEXT: s_lshr_b32 s0, s0, s2 +; GFX8-NEXT: s_lshr_b32 s1, s1, s3 +; GFX8-NEXT: s_or_b32 s1, s7, s1 +; GFX8-NEXT: s_or_b32 s0, s6, s0 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm @@ -131,10 +148,16 @@ define 
amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sub_i32 s3, 32, s3 +; GFX10-NEXT: s_lshl_b32 s4, s0, s2 +; GFX10-NEXT: s_lshl_b32 s5, s1, s3 ; GFX10-NEXT: s_sub_i32 s2, 32, s2 -; GFX10-NEXT: v_alignbit_b32 v1, s1, s1, s3 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s0, s2 +; GFX10-NEXT: s_sub_i32 s3, 32, s3 +; GFX10-NEXT: s_lshr_b32 s0, s0, s2 +; GFX10-NEXT: s_lshr_b32 s1, s1, s3 +; GFX10-NEXT: s_or_b32 s0, s4, s0 +; GFX10-NEXT: s_or_b32 s1, s5, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; @@ -143,12 +166,18 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sub_i32 s3, 32, s3 +; GFX11-NEXT: s_lshl_b32 s6, s0, s2 +; GFX11-NEXT: s_lshl_b32 s7, s1, s3 ; GFX11-NEXT: s_sub_i32 s2, 32, s2 -; GFX11-NEXT: v_alignbit_b32 v1, s1, s1, s3 -; GFX11-NEXT: v_alignbit_b32 v0, s0, s0, s2 +; GFX11-NEXT: s_sub_i32 s3, 32, s3 +; GFX11-NEXT: s_lshr_b32 s0, s0, s2 +; GFX11-NEXT: s_lshr_b32 s1, s1, s3 +; GFX11-NEXT: s_or_b32 s0, s6, s0 +; GFX11-NEXT: s_or_b32 s1, s7, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm entry: @@ -188,20 +217,28 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: 
s_sub_i32 s4, 32, s12 -; SI-NEXT: s_sub_i32 s5, 32, s13 -; SI-NEXT: s_sub_i32 s6, 32, s15 -; SI-NEXT: s_sub_i32 s7, 32, s14 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_alignbit_b32 v3, s11, s11, v0 +; SI-NEXT: s_lshl_b32 s2, s8, s12 +; SI-NEXT: s_lshl_b32 s4, s9, s13 +; SI-NEXT: s_lshl_b32 s5, s10, s14 +; SI-NEXT: s_lshl_b32 s6, s11, s15 +; SI-NEXT: s_sub_i32 s7, 32, s15 +; SI-NEXT: s_sub_i32 s14, 32, s14 +; SI-NEXT: s_sub_i32 s13, 32, s13 +; SI-NEXT: s_sub_i32 s12, 32, s12 +; SI-NEXT: s_lshr_b32 s8, s8, s12 +; SI-NEXT: s_lshr_b32 s9, s9, s13 +; SI-NEXT: s_lshr_b32 s10, s10, s14 +; SI-NEXT: s_lshr_b32 s7, s11, s7 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_or_b32 s5, s5, s10 +; SI-NEXT: s_or_b32 s4, s4, s9 +; SI-NEXT: s_or_b32 s7, s2, s8 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_alignbit_b32 v2, s10, s10, v0 -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_alignbit_b32 v1, s9, s9, v0 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_alignbit_b32 v0, s8, s8, v0 +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: v_mov_b32_e32 v3, s6 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -210,19 +247,27 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sub_i32 s5, 32, s15 -; GFX8-NEXT: s_sub_i32 s4, 32, s14 -; GFX8-NEXT: v_mov_b32_e32 v0, s5 -; GFX8-NEXT: s_sub_i32 s3, 32, s13 -; GFX8-NEXT: v_alignbit_b32 v3, s11, s11, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: s_sub_i32 s2, 32, s12 -; GFX8-NEXT: v_alignbit_b32 v2, s10, s10, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_alignbit_b32 v1, s9, s9, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: s_lshl_b32 s2, s8, s12 +; GFX8-NEXT: s_lshl_b32 s3, s9, s13 +; GFX8-NEXT: s_sub_i32 s6, 32, s15 +; GFX8-NEXT: s_sub_i32 s7, 32, s14 +; GFX8-NEXT: 
s_sub_i32 s13, 32, s13 +; GFX8-NEXT: s_sub_i32 s12, 32, s12 +; GFX8-NEXT: s_lshl_b32 s4, s10, s14 +; GFX8-NEXT: s_lshl_b32 s5, s11, s15 +; GFX8-NEXT: s_lshr_b32 s8, s8, s12 +; GFX8-NEXT: s_lshr_b32 s9, s9, s13 +; GFX8-NEXT: s_lshr_b32 s7, s10, s7 +; GFX8-NEXT: s_lshr_b32 s6, s11, s6 +; GFX8-NEXT: s_or_b32 s5, s5, s6 +; GFX8-NEXT: s_or_b32 s4, s4, s7 +; GFX8-NEXT: s_or_b32 s3, s3, s9 +; GFX8-NEXT: s_or_b32 s2, s2, s8 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_alignbit_b32 v0, s8, s8, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -234,14 +279,26 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sub_i32 s2, 32, s12 -; GFX10-NEXT: s_sub_i32 s3, 32, s13 -; GFX10-NEXT: s_sub_i32 s4, 32, s15 -; GFX10-NEXT: s_sub_i32 s5, 32, s14 -; GFX10-NEXT: v_alignbit_b32 v3, s11, s11, s4 -; GFX10-NEXT: v_alignbit_b32 v2, s10, s10, s5 -; GFX10-NEXT: v_alignbit_b32 v1, s9, s9, s3 -; GFX10-NEXT: v_alignbit_b32 v0, s8, s8, s2 +; GFX10-NEXT: s_lshl_b32 s2, s8, s12 +; GFX10-NEXT: s_lshl_b32 s3, s9, s13 +; GFX10-NEXT: s_sub_i32 s6, 32, s15 +; GFX10-NEXT: s_sub_i32 s7, 32, s14 +; GFX10-NEXT: s_sub_i32 s12, 32, s12 +; GFX10-NEXT: s_sub_i32 s13, 32, s13 +; GFX10-NEXT: s_lshl_b32 s4, s10, s14 +; GFX10-NEXT: s_lshl_b32 s5, s11, s15 +; GFX10-NEXT: s_lshr_b32 s8, s8, s12 +; GFX10-NEXT: s_lshr_b32 s9, s9, s13 +; GFX10-NEXT: s_lshr_b32 s6, s11, s6 +; GFX10-NEXT: s_lshr_b32 s7, s10, s7 +; GFX10-NEXT: s_or_b32 s5, s5, s6 +; GFX10-NEXT: s_or_b32 s4, s4, s7 +; GFX10-NEXT: s_or_b32 s2, s2, s8 +; GFX10-NEXT: s_or_b32 s3, s3, s9 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 
+; GFX10-NEXT: v_mov_b32_e32 v3, s5 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -250,16 +307,27 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sub_i32 s2, 32, s12 -; GFX11-NEXT: s_sub_i32 s3, 32, s13 -; GFX11-NEXT: s_sub_i32 s4, 32, s15 -; GFX11-NEXT: s_sub_i32 s5, 32, s14 -; GFX11-NEXT: v_alignbit_b32 v3, s11, s11, s4 -; GFX11-NEXT: v_alignbit_b32 v2, s10, s10, s5 -; GFX11-NEXT: v_alignbit_b32 v1, s9, s9, s3 -; GFX11-NEXT: v_alignbit_b32 v0, s8, s8, s2 +; GFX11-NEXT: s_lshl_b32 s2, s8, s12 +; GFX11-NEXT: s_lshl_b32 s3, s9, s13 +; GFX11-NEXT: s_sub_i32 s6, 32, s15 +; GFX11-NEXT: s_sub_i32 s7, 32, s14 +; GFX11-NEXT: s_sub_i32 s12, 32, s12 +; GFX11-NEXT: s_sub_i32 s13, 32, s13 +; GFX11-NEXT: s_lshl_b32 s4, s10, s14 +; GFX11-NEXT: s_lshl_b32 s5, s11, s15 +; GFX11-NEXT: s_lshr_b32 s8, s8, s12 +; GFX11-NEXT: s_lshr_b32 s9, s9, s13 +; GFX11-NEXT: s_lshr_b32 s6, s11, s6 +; GFX11-NEXT: s_lshr_b32 s7, s10, s7 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_or_b32 s4, s4, s7 +; GFX11-NEXT: s_or_b32 s2, s2, s8 +; GFX11-NEXT: s_or_b32 s3, s3, s9 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll index d6e361d6e297e..cb1266a048764 100644 --- a/llvm/test/CodeGen/AMDGPU/rotr.ll +++ b/llvm/test/CodeGen/AMDGPU/rotr.ll @@ -5,6 +5,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 
-mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; R600-LABEL: rotr_i32: @@ -22,12 +23,15 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_sub_i32 s4, 32, s3 +; SI-NEXT: s_lshr_b32 s3, s2, s3 +; SI-NEXT: s_lshl_b32 s2, s2, s4 +; SI-NEXT: s_or_b32 s2, s2, s3 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_alignbit_b32 v0, s2, s2, v0 +; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -35,10 +39,13 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_alignbit_b32 v2, s2, s2, v0 +; GFX8-NEXT: s_sub_i32 s4, 32, s3 +; GFX8-NEXT: s_lshr_b32 s3, s2, s3 +; GFX8-NEXT: s_lshl_b32 s2, s2, s4 +; GFX8-NEXT: s_or_b32 s2, s2, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -47,18 +54,41 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s2, s2, s3 +; GFX10-NEXT: s_sub_i32 s4, 32, s3 +; 
GFX10-NEXT: s_lshl_b32 s4, s2, s4 +; GFX10-NEXT: s_lshr_b32 s2, s2, s3 +; GFX10-NEXT: s_or_b32 s2, s4, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotr_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s2, s2, s3 +; GFX11-NEXT: s_sub_i32 s4, 32, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s4, s2, s4 +; GFX11-NEXT: s_lshr_b32 s2, s2, s3 +; GFX11-NEXT: s_or_b32 s2, s4, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: rotr_i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_sub_co_i32 s4, 32, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_lshl_b32 s4, s2, s4 +; GFX12-NEXT: s_lshr_b32 s2, s2, s3 +; GFX12-NEXT: s_or_b32 s2, s4, s2 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm entry: %tmp0 = sub i32 32, %y %tmp1 = shl i32 %x, %tmp0 @@ -86,12 +116,18 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_alignbit_b32 v1, s1, s1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_alignbit_b32 v0, s0, s0, v0 +; SI-NEXT: s_sub_i32 s6, 32, s3 +; SI-NEXT: s_sub_i32 s8, 32, s2 +; SI-NEXT: s_lshr_b32 s2, s0, s2 +; 
SI-NEXT: s_lshr_b32 s3, s1, s3 +; SI-NEXT: s_lshl_b32 s0, s0, s8 +; SI-NEXT: s_lshl_b32 s1, s1, s6 +; SI-NEXT: s_or_b32 s1, s1, s3 +; SI-NEXT: s_or_b32 s0, s0, s2 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -100,11 +136,17 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_alignbit_b32 v1, s1, s1, v0 -; GFX8-NEXT: v_alignbit_b32 v0, s0, s0, v2 +; GFX8-NEXT: s_sub_i32 s6, 32, s3 +; GFX8-NEXT: s_sub_i32 s7, 32, s2 +; GFX8-NEXT: s_lshr_b32 s2, s0, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s7 +; GFX8-NEXT: s_lshl_b32 s6, s1, s6 +; GFX8-NEXT: s_lshr_b32 s1, s1, s3 +; GFX8-NEXT: s_or_b32 s1, s6, s1 +; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm @@ -116,8 +158,16 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s1, s1, s3 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s0, s2 +; GFX10-NEXT: s_sub_i32 s4, 32, s3 +; GFX10-NEXT: s_sub_i32 s5, 32, s2 +; GFX10-NEXT: s_lshr_b32 s2, s0, s2 +; GFX10-NEXT: s_lshr_b32 s3, s1, s3 +; GFX10-NEXT: s_lshl_b32 s0, s0, s5 +; GFX10-NEXT: s_lshl_b32 s1, s1, s4 +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_or_b32 s1, s1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; @@ -126,12 +176,40 @@ define 
amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s1, s1, s3 -; GFX11-NEXT: v_alignbit_b32 v0, s0, s0, s2 +; GFX11-NEXT: s_sub_i32 s6, 32, s3 +; GFX11-NEXT: s_sub_i32 s7, 32, s2 +; GFX11-NEXT: s_lshr_b32 s2, s0, s2 +; GFX11-NEXT: s_lshr_b32 s3, s1, s3 +; GFX11-NEXT: s_lshl_b32 s0, s0, s7 +; GFX11-NEXT: s_lshl_b32 s1, s1, s6 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_or_b32 s1, s1, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: rotr_v2i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_sub_co_i32 s6, 32, s3 +; GFX12-NEXT: s_sub_co_i32 s7, 32, s2 +; GFX12-NEXT: s_lshr_b32 s2, s0, s2 +; GFX12-NEXT: s_lshr_b32 s3, s1, s3 +; GFX12-NEXT: s_lshl_b32 s0, s0, s7 +; GFX12-NEXT: s_lshl_b32 s1, s1, s6 +; GFX12-NEXT: s_or_b32 s0, s0, s2 +; GFX12-NEXT: s_or_b32 s1, s1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-NEXT: s_endpgm entry: %tmp0 = sub <2 x i32> , %y %tmp1 = shl <2 x i32> %x, %tmp0 @@ -161,16 +239,28 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s15 -; SI-NEXT: 
v_alignbit_b32 v3, s11, s11, v0 -; SI-NEXT: v_mov_b32_e32 v0, s14 -; SI-NEXT: v_alignbit_b32 v2, s10, s10, v0 -; SI-NEXT: v_mov_b32_e32 v0, s13 -; SI-NEXT: v_alignbit_b32 v1, s9, s9, v0 -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: v_alignbit_b32 v0, s8, s8, v0 +; SI-NEXT: s_sub_i32 s2, 32, s15 +; SI-NEXT: s_sub_i32 s4, 32, s14 +; SI-NEXT: s_sub_i32 s5, 32, s13 +; SI-NEXT: s_sub_i32 s6, 32, s12 +; SI-NEXT: s_lshr_b32 s7, s8, s12 +; SI-NEXT: s_lshr_b32 s12, s9, s13 +; SI-NEXT: s_lshr_b32 s13, s10, s14 +; SI-NEXT: s_lshr_b32 s14, s11, s15 +; SI-NEXT: s_lshl_b32 s6, s8, s6 +; SI-NEXT: s_lshl_b32 s5, s9, s5 +; SI-NEXT: s_lshl_b32 s4, s10, s4 +; SI-NEXT: s_lshl_b32 s2, s11, s2 +; SI-NEXT: s_or_b32 s8, s2, s14 +; SI-NEXT: s_or_b32 s4, s4, s13 +; SI-NEXT: s_or_b32 s5, s5, s12 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v3, s8 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -179,15 +269,27 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s15 -; GFX8-NEXT: v_mov_b32_e32 v1, s14 -; GFX8-NEXT: v_mov_b32_e32 v4, s13 -; GFX8-NEXT: v_alignbit_b32 v3, s11, s11, v0 -; GFX8-NEXT: v_alignbit_b32 v2, s10, s10, v1 -; GFX8-NEXT: v_alignbit_b32 v1, s9, s9, v4 -; GFX8-NEXT: v_mov_b32_e32 v0, s12 +; GFX8-NEXT: s_sub_i32 s2, 32, s15 +; GFX8-NEXT: s_sub_i32 s3, 32, s14 +; GFX8-NEXT: s_sub_i32 s4, 32, s13 +; GFX8-NEXT: s_sub_i32 s5, 32, s12 +; GFX8-NEXT: s_lshl_b32 s5, s8, s5 +; GFX8-NEXT: s_lshl_b32 s4, s9, s4 +; GFX8-NEXT: s_lshl_b32 s3, s10, s3 +; GFX8-NEXT: s_lshl_b32 s2, s11, s2 +; GFX8-NEXT: s_lshr_b32 s6, s8, s12 +; GFX8-NEXT: s_lshr_b32 s7, s9, s13 +; GFX8-NEXT: s_lshr_b32 s8, s10, s14 +; GFX8-NEXT: s_lshr_b32 s9, 
s11, s15 +; GFX8-NEXT: s_or_b32 s2, s2, s9 +; GFX8-NEXT: s_or_b32 s3, s3, s8 +; GFX8-NEXT: s_or_b32 s4, s4, s7 +; GFX8-NEXT: s_or_b32 s5, s5, s6 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_alignbit_b32 v0, s8, s8, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -199,10 +301,26 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v3, s11, s11, s15 -; GFX10-NEXT: v_alignbit_b32 v2, s10, s10, s14 -; GFX10-NEXT: v_alignbit_b32 v1, s9, s9, s13 -; GFX10-NEXT: v_alignbit_b32 v0, s8, s8, s12 +; GFX10-NEXT: s_sub_i32 s2, 32, s15 +; GFX10-NEXT: s_sub_i32 s3, 32, s14 +; GFX10-NEXT: s_sub_i32 s4, 32, s13 +; GFX10-NEXT: s_sub_i32 s5, 32, s12 +; GFX10-NEXT: s_lshr_b32 s6, s8, s12 +; GFX10-NEXT: s_lshr_b32 s7, s9, s13 +; GFX10-NEXT: s_lshr_b32 s12, s10, s14 +; GFX10-NEXT: s_lshr_b32 s13, s11, s15 +; GFX10-NEXT: s_lshl_b32 s5, s8, s5 +; GFX10-NEXT: s_lshl_b32 s4, s9, s4 +; GFX10-NEXT: s_lshl_b32 s2, s11, s2 +; GFX10-NEXT: s_lshl_b32 s3, s10, s3 +; GFX10-NEXT: s_or_b32 s2, s2, s13 +; GFX10-NEXT: s_or_b32 s3, s3, s12 +; GFX10-NEXT: s_or_b32 s5, s5, s6 +; GFX10-NEXT: s_or_b32 s4, s4, s7 +; GFX10-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s3 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -211,14 +329,58 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-NEXT: v_alignbit_b32 v3, s11, s11, s15 -; GFX11-NEXT: v_alignbit_b32 v2, s10, s10, s14 -; GFX11-NEXT: v_alignbit_b32 v1, s9, s9, s13 -; GFX11-NEXT: v_alignbit_b32 v0, s8, s8, s12 +; GFX11-NEXT: s_sub_i32 s2, 32, s15 +; GFX11-NEXT: s_sub_i32 s3, 32, s14 +; GFX11-NEXT: s_sub_i32 s4, 32, s13 +; GFX11-NEXT: s_sub_i32 s5, 32, s12 +; GFX11-NEXT: s_lshr_b32 s6, s8, s12 +; GFX11-NEXT: s_lshr_b32 s7, s9, s13 +; GFX11-NEXT: s_lshr_b32 s12, s10, s14 +; GFX11-NEXT: s_lshr_b32 s13, s11, s15 +; GFX11-NEXT: s_lshl_b32 s5, s8, s5 +; GFX11-NEXT: s_lshl_b32 s4, s9, s4 +; GFX11-NEXT: s_lshl_b32 s2, s11, s2 +; GFX11-NEXT: s_lshl_b32 s3, s10, s3 +; GFX11-NEXT: s_or_b32 s2, s2, s13 +; GFX11-NEXT: s_or_b32 s3, s3, s12 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_or_b32 s4, s4, s7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, s3 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: rotr_v4i32: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_sub_co_i32 s2, 32, s15 +; GFX12-NEXT: s_sub_co_i32 s3, 32, s14 +; GFX12-NEXT: s_sub_co_i32 s4, 32, s13 +; GFX12-NEXT: s_sub_co_i32 s5, 32, s12 +; GFX12-NEXT: s_lshr_b32 s6, s8, s12 +; GFX12-NEXT: s_lshr_b32 s7, s9, s13 +; GFX12-NEXT: s_lshr_b32 s12, s10, s14 +; GFX12-NEXT: s_lshr_b32 s13, s11, s15 +; GFX12-NEXT: s_lshl_b32 s5, s8, s5 +; GFX12-NEXT: s_lshl_b32 s4, s9, s4 +; GFX12-NEXT: s_lshl_b32 s2, s11, s2 +; GFX12-NEXT: s_lshl_b32 s3, s10, s3 +; GFX12-NEXT: s_or_b32 s2, s2, s13 +; GFX12-NEXT: s_or_b32 s3, s3, s12 +; GFX12-NEXT: s_or_b32 s5, s5, s6 +; GFX12-NEXT: s_or_b32 s4, s4, s7 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4 +; 
GFX12-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s2 +; GFX12-NEXT: v_mov_b32_e32 v2, s3 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX12-NEXT: s_endpgm entry: %tmp0 = sub <4 x i32> , %y %tmp1 = shl <4 x i32> %x, %tmp0 @@ -357,6 +519,25 @@ define void @test_rotr_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX11-FAKE16-NEXT: global_store_b16 v[4:5], v0, off offset:8 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_rotr_i16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_u16 v2, v[2:3], off offset:48 +; GFX12-NEXT: global_load_u16 v0, v[0:1], off offset:32 +; GFX12-NEXT: s_wait_loadcnt 0x1 +; GFX12-NEXT: v_sub_nc_u16 v1, 0, v2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshrrev_b16 v2, v2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b16 v0, v1, v0 +; GFX12-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX12-NEXT: global_store_b16 v[4:5], v0, off offset:8 +; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %sourceA, i64 16 %a = load i16, ptr addrspace(1) %arrayidx diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll index 3fbfd756b97e6..107fc8aaa86a1 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -17,8 +17,9 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr add ; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; 
SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_mov_b32_e32 v1, v0 @@ -39,8 +40,8 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr add ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -85,8 +86,9 @@ define amdgpu_kernel void @scalar_to_vector_v2f32(ptr addrspace(1) %out, ptr add ; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_mov_b32_e32 v1, v0 @@ -107,8 +109,8 @@ define amdgpu_kernel void @scalar_to_vector_v2f32(ptr addrspace(1) %out, ptr add ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v1, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm diff --git a/llvm/test/Transforms/InferAddressSpaces/SPIRV/generic-cast-explicit.ll b/llvm/test/Transforms/InferAddressSpaces/SPIRV/generic-cast-explicit.ll index aa39797d74a10..a1710a9b5a7a8 100644 --- a/llvm/test/Transforms/InferAddressSpaces/SPIRV/generic-cast-explicit.ll +++ 
b/llvm/test/Transforms/InferAddressSpaces/SPIRV/generic-cast-explicit.ll @@ -1,15 +1,16 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; This test checks that the address space casts for SPIR-V generic pointer casts ; are lowered correctly by the infer-address-spaces pass. ; RUN: opt < %s -passes=infer-address-spaces -S --mtriple=spirv64-unknown-unknown | FileCheck %s -; Casting a global pointer to a global pointer. +; Casting a global pointer to a global pointer. ; The uses of c2 will be replaced with %global. ; CHECK: @kernel1(ptr addrspace(1) %global) define i1 @kernel1(ptr addrspace(1) %global) { %c1 = addrspacecast ptr addrspace(1) %global to ptr addrspace(4) %c2 = call ptr addrspace(1) @llvm.spv.generic.cast.to.ptr.explicit(ptr addrspace(4) %c1) ; CHECK: %b1 = icmp eq ptr addrspace(1) %global, null - %b1 = icmp eq ptr addrspace(1) %c2, null + %b1 = icmp eq ptr addrspace(1) %c2, null ret i1 %b1 } @@ -31,7 +32,7 @@ define i1 @kernel3(ptr addrspace(1) %global) { %c1 = addrspacecast ptr addrspace(1) %global to ptr addrspace(4) %c2 = call ptr @llvm.spv.generic.cast.to.ptr.explicit(ptr addrspace(4) %c1) ; CHECK: %b1 = icmp eq ptr null, null - %b1 = icmp eq ptr %c2, null + %b1 = icmp eq ptr %c2, null ret i1 %b1 } @@ -42,7 +43,7 @@ define i1 @kernel4(ptr addrspace(3) %local) { %c1 = addrspacecast ptr addrspace(3) %local to ptr addrspace(4) %c2 = call ptr addrspace(3) @llvm.spv.generic.cast.to.ptr.explicit(ptr addrspace(4) %c1) ; CHECK: %b1 = icmp eq ptr addrspace(3) %local, null - %b1 = icmp eq ptr addrspace(3) %c2, null + %b1 = icmp eq ptr addrspace(3) %c2, null ret i1 %b1 } @@ -53,7 +54,7 @@ define i1 @kernel5(ptr addrspace(3) %local) { %c1 = addrspacecast ptr addrspace(3) %local to ptr addrspace(4) %c2 = call ptr addrspace(1) @llvm.spv.generic.cast.to.ptr.explicit(ptr addrspace(4) %c1) ; CHECK: %b1 = icmp eq ptr addrspace(1) null, null - %b1 = icmp eq ptr addrspace(1) %c2, null + %b1 = icmp eq ptr 
addrspace(1) %c2, null ret i1 %b1 } @@ -64,7 +65,7 @@ define i1 @kernel6(ptr addrspace(3) %local) { %c1 = addrspacecast ptr addrspace(3) %local to ptr addrspace(4) %c2 = call ptr @llvm.spv.generic.cast.to.ptr.explicit(ptr addrspace(4) %c1) ; CHECK: %b1 = icmp eq ptr null, null - %b1 = icmp eq ptr %c2, null + %b1 = icmp eq ptr %c2, null ret i1 %b1 } @@ -75,7 +76,7 @@ define i1 @kernel7(ptr %private) { %c1 = addrspacecast ptr %private to ptr addrspace(4) %c2 = call ptr @llvm.spv.generic.cast.to.ptr.explicit(ptr addrspace(4) %c1) ; CHECK: %b1 = icmp eq ptr %private, null - %b1 = icmp eq ptr %c2, null + %b1 = icmp eq ptr %c2, null ret i1 %b1 }