diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index cbdc1b6031680..3239b35031e36 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3209,10 +3209,12 @@ class LLVM_ABI TargetLoweringBase {
   /// \p Shuffles is the shufflevector list to DE-interleave the loaded vector.
   /// \p Indices is the corresponding indices for each shufflevector.
   /// \p Factor is the interleave factor.
+  /// \p MaskFactor is the interleave factor after taking the mask into
+  /// account; a gap mask can make it smaller than the original \p Factor.
   virtual bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
                                     ArrayRef<ShuffleVectorInst *> Shuffles,
-                                    ArrayRef<unsigned> Indices,
-                                    unsigned Factor) const {
+                                    ArrayRef<unsigned> Indices, unsigned Factor,
+                                    unsigned MaskFactor) const {
     return false;
   }
 
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 5e508989ef2da..7c3b0db50f2ad 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -268,13 +268,19 @@ static Value *getMaskOperand(IntrinsicInst *II) {
   }
 }
 
-// Return the corresponded deinterleaved mask, or nullptr if there is no valid
-// mask.
-static Value *getMask(Value *WideMask, unsigned Factor,
-                      ElementCount LeafValueEC);
-
-static Value *getMask(Value *WideMask, unsigned Factor,
-                      VectorType *LeafValueTy) {
+// Return a pair of
+// (1) The corresponding deinterleaved mask, or nullptr if there is no valid
+// mask.
+// (2) The effective factor: some masks skip entire fields, in which case
+// this element is the factor after such gaps are contracted away. Note
+// that we currently only support skipping trailing fields. So with a
+// "nominal" factor of 5, you can skip fields 3 and 4, but you cannot skip
+// only fields 1 and 2.
+static std::pair<Value *, unsigned> getMask(Value *WideMask, unsigned Factor,
+                                            ElementCount LeafValueEC);
+
+static std::pair<Value *, unsigned> getMask(Value *WideMask, unsigned Factor,
+                                            VectorType *LeafValueTy) {
   return getMask(WideMask, Factor, LeafValueTy->getElementCount());
 }
 
@@ -379,22 +385,25 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
   replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load);
 
   Value *Mask = nullptr;
+  unsigned GapMaskFactor = Factor;
   if (LI) {
     LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n");
   } else {
     // Check mask operand. Handle both all-true/false and interleaved mask.
-    Mask = getMask(getMaskOperand(II), Factor, VecTy);
+    std::tie(Mask, GapMaskFactor) = getMask(getMaskOperand(II), Factor, VecTy);
     if (!Mask)
       return false;
 
     LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load or masked.load: "
                       << *Load << "\n");
+    LLVM_DEBUG(dbgs() << "IA: With nominal factor " << Factor
+                      << " and mask factor " << GapMaskFactor << "\n");
   }
 
   // Try to create target specific intrinsics to replace the load and
   // shuffles.
   if (!TLI->lowerInterleavedLoad(cast<Instruction>(Load), Mask, Shuffles,
-                                 Indices, Factor))
+                                 Indices, Factor, GapMaskFactor))
     // If Extracts is not empty, tryReplaceExtracts made changes earlier.
     return !Extracts.empty() || BinOpShuffleChanged;
 
@@ -531,15 +540,20 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
          "number of stored element should be a multiple of Factor");
 
   Value *Mask = nullptr;
+  unsigned GapMaskFactor = Factor;
   if (SI) {
     LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *Store << "\n");
   } else {
     // Check mask operand. Handle both all-true/false and interleaved mask.
     unsigned LaneMaskLen = NumStoredElements / Factor;
-    Mask = getMask(getMaskOperand(II), Factor,
-                   ElementCount::getFixed(LaneMaskLen));
+    std::tie(Mask, GapMaskFactor) = getMask(
+        getMaskOperand(II), Factor, ElementCount::getFixed(LaneMaskLen));
     if (!Mask)
       return false;
+    // We shouldn't transform stores even if they have a gap mask. Since we
+    // might have already changed the IR, return true here.
+    if (GapMaskFactor != Factor)
+      return true;
 
     LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store or masked.store: "
                       << *Store << "\n");
@@ -556,34 +570,87 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
   return true;
 }
 
-static Value *getMask(Value *WideMask, unsigned Factor,
-                      ElementCount LeafValueEC) {
+// A wide mask <1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0> could be used to skip the
+// last field in a factor-of-three interleaved store or deinterleaved load (in
+// which case LeafMaskLen is 4). Such a (wide) mask is also known as a gap
+// mask. This helper detects that pattern and returns the factor we are
+// actually accessing, which is 2 in this example.
+static unsigned getGapMaskFactor(const Constant &MaskConst, unsigned Factor,
+                                 unsigned LeafMaskLen) {
+  APInt FactorMask(Factor, 0);
+  FactorMask.setAllBits();
+  for (unsigned F = 0U; F < Factor; ++F) {
+    bool AllZero = true;
+    for (unsigned Idx = 0U; Idx < LeafMaskLen; ++Idx) {
+      Constant *C = MaskConst.getAggregateElement(F + Idx * Factor);
+      if (!C->isZeroValue()) {
+        AllZero = false;
+        break;
+      }
+    }
+    // All mask bits for this field are zero, so skip it.
+    if (AllZero)
+      FactorMask.clearBit(F);
+  }
+  // We currently only allow gaps in the "trailing" factors / fields. So
+  // given an original factor of 4, we can skip fields 2 and 3, but we
+  // cannot skip only fields 1 and 2. If FactorMask does not match such a
+  // pattern, reset it.
+  if (!FactorMask.isMask())
+    FactorMask.setAllBits();
+
+  return FactorMask.popcount();
+}
+
+static std::pair<Value *, unsigned> getMask(Value *WideMask, unsigned Factor,
+                                            ElementCount LeafValueEC) {
+  using namespace PatternMatch;
+
   if (auto *IMI = dyn_cast<IntrinsicInst>(WideMask)) {
     if (unsigned F = getInterleaveIntrinsicFactor(IMI->getIntrinsicID());
        F && F == Factor && llvm::all_equal(IMI->args())) {
-      return IMI->getArgOperand(0);
+      return {IMI->getArgOperand(0), Factor};
     }
   }
 
+  // Try to match `and <interleaved mask>, <gap mask>`. The WideMask here is
+  // expected to be a fixed vector and the gap mask should be a constant mask.
+  Value *AndMaskLHS;
+  Constant *AndMaskRHS;
+  if (match(WideMask, m_c_And(m_Value(AndMaskLHS), m_Constant(AndMaskRHS))) &&
+      LeafValueEC.isFixed()) {
+    assert(!isa<Constant>(AndMaskLHS) &&
+           "expect constants to be folded already");
+    return {getMask(AndMaskLHS, Factor, LeafValueEC).first,
+            getGapMaskFactor(*AndMaskRHS, Factor, LeafValueEC.getFixedValue())};
+  }
+
   if (auto *ConstMask = dyn_cast<Constant>(WideMask)) {
     if (auto *Splat = ConstMask->getSplatValue())
       // All-ones or all-zeros mask.
-      return ConstantVector::getSplat(LeafValueEC, Splat);
+      return {ConstantVector::getSplat(LeafValueEC, Splat), Factor};
 
     if (LeafValueEC.isFixed()) {
       unsigned LeafMaskLen = LeafValueEC.getFixedValue();
+      // First, check if we use a gap mask to skip some of the factors / fields.
+      const unsigned GapMaskFactor =
+          getGapMaskFactor(*ConstMask, Factor, LeafMaskLen);
+      assert(GapMaskFactor <= Factor);
+
       SmallVector<Constant *, 8> LeafMask(LeafMaskLen, nullptr);
       // If this is a fixed-length constant mask, each lane / leaf has to
       // use the same mask.
This is done by checking if every group with Factor // number of elements in the interleaved mask has homogeneous values. for (unsigned Idx = 0U; Idx < LeafMaskLen * Factor; ++Idx) { + if (Idx % Factor >= GapMaskFactor) + continue; Constant *C = ConstMask->getAggregateElement(Idx); if (LeafMask[Idx / Factor] && LeafMask[Idx / Factor] != C) - return nullptr; + return {nullptr, Factor}; LeafMask[Idx / Factor] = C; } - return ConstantVector::get(LeafMask); + return {ConstantVector::get(LeafMask), GapMaskFactor}; } } @@ -603,12 +670,13 @@ static Value *getMask(Value *WideMask, unsigned Factor, auto *LeafMaskTy = VectorType::get(Type::getInt1Ty(SVI->getContext()), LeafValueEC); IRBuilder<> Builder(SVI); - return Builder.CreateExtractVector(LeafMaskTy, SVI->getOperand(0), - uint64_t(0)); + return {Builder.CreateExtractVector(LeafMaskTy, SVI->getOperand(0), + uint64_t(0)), + Factor}; } } - return nullptr; + return {nullptr, Factor}; } bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( @@ -639,9 +707,12 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( return false; // Check mask operand. Handle both all-true/false and interleaved mask. - Mask = getMask(getMaskOperand(II), Factor, getDeinterleavedVectorType(DI)); + unsigned GapMaskFactor; + std::tie(Mask, GapMaskFactor) = + getMask(getMaskOperand(II), Factor, getDeinterleavedVectorType(DI)); if (!Mask) return false; + assert(GapMaskFactor == Factor); LLVM_DEBUG(dbgs() << "IA: Found a vp.load or masked.load with deinterleave" << " intrinsic " << *DI << " and factor = " @@ -680,10 +751,13 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( II->getIntrinsicID() != Intrinsic::vp_store) return false; // Check mask operand. Handle both all-true/false and interleaved mask. - Mask = getMask(getMaskOperand(II), Factor, - cast(InterleaveValues[0]->getType())); + unsigned GapMaskFactor; + std::tie(Mask, GapMaskFactor) = + getMask(getMaskOperand(II), Factor, + cast(InterleaveValues[0]->getType())); if (!Mask) return false; + assert(GapMaskFactor == Factor); LLVM_DEBUG(dbgs() << "IA: Found a vp.store or masked.store with interleave" << " intrinsic " << *IntII << " and factor = " diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 2b6ea86ee1af5..632bb79fa02e4 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -17254,7 +17254,7 @@ static Function *getStructuredStoreFunction(Module *M, unsigned Factor, /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool AArch64TargetLowering::lowerInterleavedLoad( Instruction *Load, Value *Mask, ArrayRef Shuffles, - ArrayRef Indices, unsigned Factor) const { + ArrayRef Indices, unsigned Factor, unsigned MaskFactor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); assert(!Shuffles.empty() && "Empty shufflevector input"); @@ -17266,6 +17266,9 @@ bool AArch64TargetLowering::lowerInterleavedLoad( return false; assert(!Mask && "Unexpected mask on a load"); + if (Factor != MaskFactor) + return false; + const DataLayout &DL = LI->getDataLayout(); VectorType *VTy = Shuffles[0]->getType(); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index ea63edd86210e..d0d6512d39015 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -220,8 +220,8 @@ class AArch64TargetLowering : public TargetLowering { bool 
lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef Shuffles, - ArrayRef Indices, - unsigned Factor) const override; + ArrayRef Indices, unsigned Factor, + unsigned MaskFactor) const override; bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, unsigned Factor) const override; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 936625606e315..c087e32cd4787 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21599,7 +21599,7 @@ unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const { /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1 bool ARMTargetLowering::lowerInterleavedLoad( Instruction *Load, Value *Mask, ArrayRef Shuffles, - ArrayRef Indices, unsigned Factor) const { + ArrayRef Indices, unsigned Factor, unsigned MaskFactor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); assert(!Shuffles.empty() && "Empty shufflevector input"); @@ -21611,6 +21611,9 @@ bool ARMTargetLowering::lowerInterleavedLoad( return false; assert(!Mask && "Unexpected mask on a load"); + if (Factor != MaskFactor) + return false; + auto *VecTy = cast(Shuffles[0]->getType()); Type *EltTy = VecTy->getElementType(); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 825145d813fb1..670bbb62fe0f6 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -683,8 +683,8 @@ class VectorType; bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef Shuffles, - ArrayRef Indices, - unsigned Factor) const override; + ArrayRef Indices, unsigned Factor, + unsigned MaskFactor) const override; bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, unsigned Factor) const override; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index fa50e2105a708..4155f613f7f04 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -431,8 +431,8 @@ class RISCVTargetLowering : public TargetLowering { bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef Shuffles, - ArrayRef Indices, - unsigned Factor) const override; + ArrayRef Indices, unsigned Factor, + unsigned MaskFactor) const override; bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp index 726920e4015cf..d4e6351ea6a51 100644 --- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp +++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp @@ -63,6 +63,12 @@ static const Intrinsic::ID FixedVlsegIntrIds[] = { Intrinsic::riscv_seg6_load_mask, Intrinsic::riscv_seg7_load_mask, Intrinsic::riscv_seg8_load_mask}; +static const Intrinsic::ID FixedVlssegIntrIds[] = { + Intrinsic::riscv_sseg2_load_mask, Intrinsic::riscv_sseg3_load_mask, + Intrinsic::riscv_sseg4_load_mask, Intrinsic::riscv_sseg5_load_mask, + Intrinsic::riscv_sseg6_load_mask, Intrinsic::riscv_sseg7_load_mask, + Intrinsic::riscv_sseg8_load_mask}; + static const Intrinsic::ID ScalableVlsegIntrIds[] = { Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask, Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask, @@ -197,9 +203,13 @@ static bool getMemOperands(unsigned Factor, VectorType *VTy, Type *XLenTy, /// %vec1 = extractelement { 
<4 x i32>, <4 x i32> } %ld2, i32 1 bool RISCVTargetLowering::lowerInterleavedLoad( Instruction *Load, Value *Mask, ArrayRef Shuffles, - ArrayRef Indices, unsigned Factor) const { + ArrayRef Indices, unsigned Factor, unsigned MaskFactor) const { assert(Indices.size() == Shuffles.size()); + assert(MaskFactor <= Factor); + // TODO: Lower to strided load when MaskFactor = 1. + if (MaskFactor < 2) + return false; IRBuilder<> Builder(Load); const DataLayout &DL = Load->getDataLayout(); @@ -208,20 +218,37 @@ bool RISCVTargetLowering::lowerInterleavedLoad( Value *Ptr, *VL; Align Alignment; - if (!getMemOperands(Factor, VTy, XLenTy, Load, Ptr, Mask, VL, Alignment)) + if (!getMemOperands(MaskFactor, VTy, XLenTy, Load, Ptr, Mask, VL, Alignment)) return false; Type *PtrTy = Ptr->getType(); unsigned AS = PtrTy->getPointerAddressSpace(); - if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL)) + if (!isLegalInterleavedAccessType(VTy, MaskFactor, Alignment, AS, DL)) return false; - CallInst *VlsegN = Builder.CreateIntrinsic( - FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL}); + CallInst *SegLoad = nullptr; + if (MaskFactor < Factor) { + // Lower to strided segmented load. + unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType()); + Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes); + SegLoad = Builder.CreateIntrinsic(FixedVlssegIntrIds[MaskFactor - 2], + {VTy, PtrTy, XLenTy, XLenTy}, + {Ptr, Stride, Mask, VL}); + } else { + // Lower to normal segmented load. + SegLoad = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2], + {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL}); + } for (unsigned i = 0; i < Shuffles.size(); i++) { - Value *SubVec = Builder.CreateExtractValue(VlsegN, Indices[i]); - Shuffles[i]->replaceAllUsesWith(SubVec); + unsigned FactorIdx = Indices[i]; + if (FactorIdx >= MaskFactor) { + // Replace masked-off factors (that are still extracted) with poison. + Shuffles[i]->replaceAllUsesWith(PoisonValue::get(VTy)); + } else { + Value *SubVec = Builder.CreateExtractValue(SegLoad, FactorIdx); + Shuffles[i]->replaceAllUsesWith(SubVec); + } } return true; diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 547b2210fdbf0..242d24b5faf60 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1663,8 +1663,8 @@ namespace llvm { /// instructions/intrinsics. bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef Shuffles, - ArrayRef Indices, - unsigned Factor) const override; + ArrayRef Indices, unsigned Factor, + unsigned MaskFactor) const override; /// Lower interleaved store(s) into target specific /// instructions/intrinsics. diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp index 636b072837441..52132a9d64b1a 100644 --- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp +++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp @@ -802,7 +802,7 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() { // Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX. 
bool X86TargetLowering::lowerInterleavedLoad( Instruction *Load, Value *Mask, ArrayRef Shuffles, - ArrayRef Indices, unsigned Factor) const { + ArrayRef Indices, unsigned Factor, unsigned MaskFactor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); assert(!Shuffles.empty() && "Empty shufflevector input"); @@ -814,6 +814,9 @@ bool X86TargetLowering::lowerInterleavedLoad( return false; assert(!Mask && "Unexpected mask on a load"); + if (Factor != MaskFactor) + return false; + // Create an interleaved access group. IRBuilder<> Builder(LI); X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget, diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index 6eb0b693b5546..2c738e5aeb55b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -332,6 +332,59 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_poison_shufflemask(ptr ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 } +define {<4 x i32>, <4 x i32>} @vpload_factor3_skip_fields(ptr %ptr) { + ; mask = 1111, skip the last field. +; CHECK-LABEL: vpload_factor3_skip_fields: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 12 +; CHECK-NEXT: vsetivli zero, 6, e32, m1, ta, ma +; CHECK-NEXT: vlsseg2e32.v v8, (a0), a1 +; CHECK-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> , i32 12) + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + ret {<4 x i32>, <4 x i32>} %res1 +} + +define {<4 x i32>, <4 x i32>} @vpload_factor3_mask_skip_fields(ptr %ptr) { + ; mask = 0101, skip the last field. 
+; CHECK-LABEL: vpload_factor3_mask_skip_fields: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v0, 10 +; CHECK-NEXT: li a1, 12 +; CHECK-NEXT: vlsseg2e32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> , i32 12) + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + ret {<4 x i32>, <4 x i32>} %res1 +} + +define {<4 x i32>, <4 x i32>} @vpload_factor3_combined_mask_skip_field(ptr %ptr, <4 x i1> %mask) { +; CHECK-LABEL: vpload_factor3_combined_mask_skip_field: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 12 +; CHECK-NEXT: vsetivli zero, 6, e32, m1, ta, ma +; CHECK-NEXT: vlsseg2e32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret + %interleaved.mask = shufflevector <4 x i1> %mask, <4 x i1> poison, <12 x i32> + %combined = and <12 x i1> %interleaved.mask, + %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> %combined, i32 12) + ; mask = %mask, skip the last field + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + ret {<4 x i32>, <4 x i32>} %res1 +} + define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor4(ptr %ptr) { ; CHECK-LABEL: vpload_factor4: ; CHECK: # %bb.0: @@ -479,8 +532,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: li a2, 32 ; RV32-NEXT: lui a3, 12 ; RV32-NEXT: lui a6, 12291 -; RV32-NEXT: lui a7, %hi(.LCPI23_0) -; RV32-NEXT: addi a7, a7, %lo(.LCPI23_0) +; RV32-NEXT: lui a7, %hi(.LCPI26_0) +; RV32-NEXT: addi a7, a7, %lo(.LCPI26_0) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vle32.v v24, (a5) ; RV32-NEXT: vmv.s.x v0, a3 @@ -565,12 +618,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill ; RV32-NEXT: lui a7, 49164 -; RV32-NEXT: lui a1, %hi(.LCPI23_1) -; RV32-NEXT: addi a1, a1, %lo(.LCPI23_1) +; RV32-NEXT: lui a1, %hi(.LCPI26_1) +; RV32-NEXT: addi a1, a1, %lo(.LCPI26_1) ; RV32-NEXT: lui t2, 3 ; RV32-NEXT: lui t1, 196656 -; RV32-NEXT: lui a4, %hi(.LCPI23_3) -; RV32-NEXT: addi a4, a4, %lo(.LCPI23_3) +; RV32-NEXT: lui a4, %hi(.LCPI26_3) +; RV32-NEXT: addi a4, a4, %lo(.LCPI26_3) ; RV32-NEXT: lui t0, 786624 ; RV32-NEXT: li a5, 48 ; RV32-NEXT: lui a6, 768 @@ -749,8 +802,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV32-NEXT: vrgatherei16.vv v24, v8, v2 -; RV32-NEXT: lui a1, %hi(.LCPI23_2) -; RV32-NEXT: addi a1, a1, %lo(.LCPI23_2) +; RV32-NEXT: lui a1, %hi(.LCPI26_2) +; RV32-NEXT: addi a1, a1, %lo(.LCPI26_2) ; RV32-NEXT: lui a3, 3073 ; RV32-NEXT: addi a3, a3, -1024 ; RV32-NEXT: vmv.s.x v0, a3 @@ -814,16 +867,16 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vrgatherei16.vv v28, v8, v3 ; 
RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v28, v24 -; RV32-NEXT: lui a1, %hi(.LCPI23_4) -; RV32-NEXT: addi a1, a1, %lo(.LCPI23_4) -; RV32-NEXT: lui a2, %hi(.LCPI23_5) -; RV32-NEXT: addi a2, a2, %lo(.LCPI23_5) +; RV32-NEXT: lui a1, %hi(.LCPI26_4) +; RV32-NEXT: addi a1, a1, %lo(.LCPI26_4) +; RV32-NEXT: lui a2, %hi(.LCPI26_5) +; RV32-NEXT: addi a2, a2, %lo(.LCPI26_5) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v24, (a2) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV32-NEXT: vle16.v v8, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI23_7) -; RV32-NEXT: addi a1, a1, %lo(.LCPI23_7) +; RV32-NEXT: lui a1, %hi(.LCPI26_7) +; RV32-NEXT: addi a1, a1, %lo(.LCPI26_7) ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle16.v v10, (a1) ; RV32-NEXT: csrr a1, vlenb @@ -851,14 +904,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vrgatherei16.vv v16, v0, v10 -; RV32-NEXT: lui a1, %hi(.LCPI23_6) -; RV32-NEXT: addi a1, a1, %lo(.LCPI23_6) -; RV32-NEXT: lui a2, %hi(.LCPI23_8) -; RV32-NEXT: addi a2, a2, %lo(.LCPI23_8) +; RV32-NEXT: lui a1, %hi(.LCPI26_6) +; RV32-NEXT: addi a1, a1, %lo(.LCPI26_6) +; RV32-NEXT: lui a2, %hi(.LCPI26_8) +; RV32-NEXT: addi a2, a2, %lo(.LCPI26_8) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV32-NEXT: vle16.v v4, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI23_9) -; RV32-NEXT: addi a1, a1, %lo(.LCPI23_9) +; RV32-NEXT: lui a1, %hi(.LCPI26_9) +; RV32-NEXT: addi a1, a1, %lo(.LCPI26_9) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v6, (a1) ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma @@ -945,8 +998,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: li a4, 128 ; RV64-NEXT: lui a1, 1 ; RV64-NEXT: vle64.v v8, (a3) -; RV64-NEXT: lui a3, %hi(.LCPI23_0) -; RV64-NEXT: addi a3, a3, %lo(.LCPI23_0) +; RV64-NEXT: lui a3, %hi(.LCPI26_0) +; RV64-NEXT: addi a3, a3, %lo(.LCPI26_0) ; RV64-NEXT: vmv.s.x v0, a4 ; RV64-NEXT: csrr a4, vlenb ; RV64-NEXT: li a5, 61 @@ -1134,8 +1187,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vslideup.vi v12, v16, 1, v0.t -; RV64-NEXT: lui a2, %hi(.LCPI23_1) -; RV64-NEXT: addi a2, a2, %lo(.LCPI23_1) +; RV64-NEXT: lui a2, %hi(.LCPI26_1) +; RV64-NEXT: addi a2, a2, %lo(.LCPI26_1) ; RV64-NEXT: li a3, 192 ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vle16.v v6, (a2) @@ -1169,8 +1222,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vrgatherei16.vv v24, v16, v6 ; RV64-NEXT: addi a2, sp, 16 ; RV64-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill -; RV64-NEXT: lui a2, %hi(.LCPI23_2) -; RV64-NEXT: addi a2, a2, %lo(.LCPI23_2) +; RV64-NEXT: lui a2, %hi(.LCPI26_2) +; RV64-NEXT: addi a2, a2, %lo(.LCPI26_2) ; RV64-NEXT: li a3, 1040 ; RV64-NEXT: vmv.s.x v0, a3 ; RV64-NEXT: addi a1, a1, -2016 @@ -1254,12 +1307,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill -; RV64-NEXT: lui a1, %hi(.LCPI23_3) -; RV64-NEXT: addi a1, a1, %lo(.LCPI23_3) +; RV64-NEXT: lui a1, %hi(.LCPI26_3) +; RV64-NEXT: addi a1, a1, %lo(.LCPI26_3) ; RV64-NEXT: 
vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vle16.v v20, (a1) -; RV64-NEXT: lui a1, %hi(.LCPI23_4) -; RV64-NEXT: addi a1, a1, %lo(.LCPI23_4) +; RV64-NEXT: lui a1, %hi(.LCPI26_4) +; RV64-NEXT: addi a1, a1, %lo(.LCPI26_4) ; RV64-NEXT: vle16.v v8, (a1) ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 77 @@ -1310,8 +1363,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vl2r.v v8, (a1) # vscale x 16-byte Folded Reload ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vrgatherei16.vv v0, v16, v8 -; RV64-NEXT: lui a1, %hi(.LCPI23_5) -; RV64-NEXT: addi a1, a1, %lo(.LCPI23_5) +; RV64-NEXT: lui a1, %hi(.LCPI26_5) +; RV64-NEXT: addi a1, a1, %lo(.LCPI26_5) ; RV64-NEXT: vle16.v v20, (a1) ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 61 @@ -1928,8 +1981,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_mask(ptr %ptr) { ; RV32-NEXT: vle32.v v12, (a0), v0.t ; RV32-NEXT: li a0, 36 ; RV32-NEXT: vmv.s.x v20, a1 -; RV32-NEXT: lui a1, %hi(.LCPI59_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI59_0) +; RV32-NEXT: lui a1, %hi(.LCPI62_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI62_0) ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vle16.v v21, (a1) ; RV32-NEXT: vcompress.vm v8, v12, v11 @@ -2004,8 +2057,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_evl(ptr %ptr) { ; RV32-NEXT: vmv.s.x v10, a0 ; RV32-NEXT: li a0, 146 ; RV32-NEXT: vmv.s.x v11, a0 -; RV32-NEXT: lui a0, %hi(.LCPI60_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI60_0) +; RV32-NEXT: lui a0, %hi(.LCPI63_0) +; RV32-NEXT: addi a0, a0, %lo(.LCPI63_0) ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vle16.v v20, (a0) ; RV32-NEXT: li a0, 36 @@ -2094,3 +2147,175 @@ define void @maskedstore_factor2(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1) { tail call void @llvm.masked.store(<8 x i32> %interleaved.vec, ptr %ptr, i32 4, <8 x i1> splat (i1 true)) ret void } + +define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_mask(ptr %ptr) { +; CHECK-LABEL: maskedload_factor3_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v0, 5 +; CHECK-NEXT: vlseg3e32.v v8, (a0), v0.t +; CHECK-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %ptr, i32 4, <12 x i1> , <12 x i32> poison) + ; mask = 1010 + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} + +define {<4 x i32>, <4 x i32>} @maskedload_factor3_skip_field(ptr %ptr) { +; CHECK-LABEL: maskedload_factor3_skip_field: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 12 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlsseg2e32.v v8, (a0), a1 +; CHECK-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %ptr, i32 4, <12 x i1> , <12 x i32> poison) + ; mask = 1111, skip last field + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, 
<4 x i32> %v1, 1 + ret {<4 x i32>, <4 x i32>} %res1 +} + +define {<4 x i32>, <4 x i32>} @maskedload_factor3_mask_skip_field(ptr %ptr) { +; CHECK-LABEL: maskedload_factor3_mask_skip_field: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v0, 5 +; CHECK-NEXT: li a1, 12 +; CHECK-NEXT: vlsseg2e32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %ptr, i32 4, <12 x i1> , <12 x i32> poison) + ; mask = 1010, skip the last field + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + ret {<4 x i32>, <4 x i32>} %res1 +} + +define {<4 x i32>, <4 x i32>} @maskedload_factor3_combined_mask_skip_field(ptr %ptr, <4 x i1> %mask) { +; CHECK-LABEL: maskedload_factor3_combined_mask_skip_field: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 12 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlsseg2e32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret + %interleaved.mask = shufflevector <4 x i1> %mask, <4 x i1> poison, <12 x i32> + %combined = and <12 x i1> %interleaved.mask, + %interleaved.vec = tail call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %ptr, i32 4, <12 x i1> %combined, <12 x i32> poison) + ; mask = %mask, skip the last field + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + ret {<4 x i32>, <4 x i32>} %res1 +} + +; We can only skip the last field for now. 
+define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_invalid_skip_field(ptr %ptr) { +; RV32-LABEL: maskedload_factor3_invalid_skip_field: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 73 +; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV32-NEXT: vmv.s.x v11, a1 +; RV32-NEXT: lui a1, 1 +; RV32-NEXT: vmv.v.i v10, 8 +; RV32-NEXT: addi a1, a1, -1171 +; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: li a1, 146 +; RV32-NEXT: vsetivli zero, 12, e32, m4, ta, ma +; RV32-NEXT: vle32.v v12, (a0), v0.t +; RV32-NEXT: li a0, 36 +; RV32-NEXT: vmv.s.x v20, a1 +; RV32-NEXT: lui a1, %hi(.LCPI70_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI70_0) +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vle16.v v21, (a1) +; RV32-NEXT: vcompress.vm v8, v12, v11 +; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV32-NEXT: vslidedown.vi v16, v12, 8 +; RV32-NEXT: vmv1r.v v0, v10 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vrgather.vi v8, v16, 1, v0.t +; RV32-NEXT: vcompress.vm v14, v12, v20 +; RV32-NEXT: vrgather.vi v14, v16, 2, v0.t +; RV32-NEXT: vmv.s.x v0, a0 +; RV32-NEXT: vmerge.vvm v12, v16, v12, v0 +; RV32-NEXT: vrgatherei16.vv v10, v12, v21 +; RV32-NEXT: vmv1r.v v9, v14 +; RV32-NEXT: ret +; +; RV64-LABEL: maskedload_factor3_invalid_skip_field: +; RV64: # %bb.0: +; RV64-NEXT: li a1, 73 +; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64-NEXT: vmv.s.x v11, a1 +; RV64-NEXT: li a1, 146 +; RV64-NEXT: vmv.s.x v20, a1 +; RV64-NEXT: lui a1, 1 +; RV64-NEXT: vmv.v.i v10, 8 +; RV64-NEXT: addi a1, a1, -1171 +; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: li a1, 36 +; RV64-NEXT: vsetivli zero, 12, e32, m4, ta, ma +; RV64-NEXT: vle32.v v12, (a0), v0.t +; RV64-NEXT: li a0, 3 +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: addi a0, a0, 5 +; RV64-NEXT: slli a0, a0, 16 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vcompress.vm v8, v12, v11 +; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV64-NEXT: vslidedown.vi v16, v12, 8 +; RV64-NEXT: vmv1r.v v0, v10 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64-NEXT: vrgather.vi v8, v16, 1, v0.t +; RV64-NEXT: vcompress.vm v14, v12, v20 +; RV64-NEXT: vrgather.vi v14, v16, 2, v0.t +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: addi a0, a0, 2 +; RV64-NEXT: vmerge.vvm v12, v16, v12, v0 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vrgatherei16.vv v10, v12, v9 +; RV64-NEXT: vmv1r.v v9, v14 +; RV64-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %ptr, i32 4, <12 x i1> , <12 x i32> poison) + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} + +define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor5_skip_fields(ptr %ptr) { + ; mask = 1111, skip the last two fields. 
+; CHECK-LABEL: maskedload_factor5_skip_fields: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 20 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlsseg3e32.v v8, (a0), a1 +; CHECK-NEXT: ret + %interleaved.vec = tail call <20 x i32> @llvm.masked.load(ptr %ptr, i32 4, <20 x i1> , <20 x i32> poison) + %v0 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v1 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v2 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v3 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v4 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} +
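A minimal standalone sketch of the gap-mask detection that getGapMaskFactor performs, using a plain bool array in place of the Constant-based mask elements. The helper name and the small driver below are illustrative assumptions and are not part of the patch; only the APInt bit manipulation mirrors the code above.

// gap_mask_demo.cpp - link against LLVMSupport.
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include <cstdio>

using namespace llvm;

// Mirrors getGapMaskFactor: returns the effective factor after contracting
// trailing all-zero fields; falls back to Factor for unsupported patterns.
static unsigned gapMaskFactor(ArrayRef<bool> WideMask, unsigned Factor,
                              unsigned LeafMaskLen) {
  APInt FactorMask(Factor, 0);
  FactorMask.setAllBits();
  for (unsigned F = 0; F < Factor; ++F) {
    bool AllZero = true;
    for (unsigned Idx = 0; Idx < LeafMaskLen; ++Idx)
      if (WideMask[F + Idx * Factor]) {
        AllZero = false;
        break;
      }
    // Field F is entirely masked off.
    if (AllZero)
      FactorMask.clearBit(F);
  }
  // Only trailing gaps are supported: the remaining bits must form a
  // low-bit mask such as 0b011; otherwise keep the original factor.
  if (!FactorMask.isMask())
    FactorMask.setAllBits();
  return FactorMask.popcount();
}

int main() {
  // <1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0>: factor 3, last field skipped.
  bool Mask[] = {1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0};
  printf("effective factor = %u\n", gapMaskFactor(Mask, 3, 4));
  return 0;
}

With the <1, 1, 0, ...> wide mask from the comment above (factor 3, four lanes), the trailing field is all-zero, so this prints an effective factor of 2 - the case the RISC-V lowering turns into a vlsseg2 strided segment load.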