diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 28746bf9d05aa..8ba985b17a255 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -24243,6 +24243,63 @@ Examples:
      %also.r = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %ptr, i32 2, <8 x i1> %mask, <8 x i8> poison)
 
+.. _int_vp_ff_load:
+
+'``llvm.vp.load.ff``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare {<4 x float>, i32} @llvm.vp.load.ff.v4f32.p0(ptr %ptr, <4 x i1> %mask, i32 %evl)
+      declare {<vscale x 2 x i16>, i32} @llvm.vp.load.ff.nxv2i16.p0(ptr %ptr, <vscale x 2 x i1> %mask, i32 %evl)
+      declare {<8 x float>, i32} @llvm.vp.load.ff.v8f32.p1(ptr addrspace(1) %ptr, <8 x i1> %mask, i32 %evl)
+      declare {<vscale x 1 x i64>, i32} @llvm.vp.load.ff.nxv1i64.p6(ptr addrspace(6) %ptr, <vscale x 1 x i1> %mask, i32 %evl)
+
+Overview:
+"""""""""
+
+The '``llvm.vp.load.ff.*``' intrinsic is similar to '``llvm.vp.load.*``', but
+will not trap if fewer than ``evl`` elements are readable at the pointer.
+
+Arguments:
+""""""""""
+
+The first argument is the base pointer for the load. The second argument is a
+vector of boolean values with the same number of elements as the first return
+type. The third is the explicit vector length of the operation. The first
+return type and the underlying type of the base pointer are the same vector
+types.
+
+The :ref:`align <attr_align>` parameter attribute can be provided for the
+first argument.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.load.ff``' intrinsic reads a vector from memory similarly to
+'``llvm.vp.load``', but traps only if the first lane is unreadable. If any
+other lane is unreadable, the number of lanes successfully read is returned
+in the second return value. The result in the first return value for lanes
+that were not successfully read is a :ref:`poison value <poisonvalues>`. If
+``evl`` is 0, no read occurs and thus no trap can occur for the first lane.
+If ``mask`` is 0 for the first lane, no trap occurs. This intrinsic is allowed
+to read fewer than ``evl`` lanes even if no trap would occur. If ``evl`` is
+non-zero, the second return value must be at least 1, even if the first lane
+is disabled by ``mask``.
+
+The default alignment is taken as the ABI alignment of the first return
+type as specified by the :ref:`datalayout string<langref_datalayout>`.
+
+Examples:
+"""""""""
+
+.. code-block:: text
+
+     %r = call {<8 x i8>, i32} @llvm.vp.load.ff.v8i8.p0(ptr align 2 %ptr, <8 x i1> %mask, i32 %evl)
+
 .. _int_vp_store:
 
 '``llvm.vp.store``' Intrinsic
diff --git a/llvm/include/llvm/Analysis/Loads.h b/llvm/include/llvm/Analysis/Loads.h
index 84564563de8e3..080757b6d1fe0 100644
--- a/llvm/include/llvm/Analysis/Loads.h
+++ b/llvm/include/llvm/Analysis/Loads.h
@@ -91,6 +91,14 @@ LLVM_ABI bool isDereferenceableReadOnlyLoop(
     Loop *L, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
     SmallVectorImpl<const SCEVPredicate *> *Predicates = nullptr);
 
+/// Return true if the loop \p L cannot fault on any iteration and only
+/// contains read-only memory accesses. Also collect loads that are not
+/// guaranteed to be dereferenceable.
+LLVM_ABI bool isReadOnlyLoopWithSafeOrSpeculativeLoads(
+    Loop *L, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
+    SmallVectorImpl<LoadInst *> *SpeculativeLoads,
+    SmallVectorImpl<const SCEVPredicate *> *Predicates = nullptr);
+
 /// Return true if we know that executing a load from this value cannot trap.
/// /// If DT and ScanFrom are specified this method performs context-sensitive diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 7928835f7f84d..5d562f0e0c180 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1857,6 +1857,9 @@ class TargetTransformInfo { /// \returns True if the target supports scalable vectors. LLVM_ABI bool supportsScalableVectors() const; + /// \returns True if the target supports speculative loads. + LLVM_ABI bool supportsSpeculativeLoads() const; + /// \return true when scalable vectorization is preferred. LLVM_ABI bool enableScalableVectorization() const; diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 2ea87b3c62895..5655446ccf3ec 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -1106,6 +1106,8 @@ class TargetTransformInfoImplBase { virtual bool supportsScalableVectors() const { return false; } + virtual bool supportsSpeculativeLoads() const { return false; } + virtual bool enableScalableVectorization() const { return false; } virtual bool hasActiveVectorLength() const { return false; } diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index e5644a5ef206a..6f2ad33094dce 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1668,6 +1668,9 @@ class SelectionDAG { ArrayRef Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType); + LLVM_ABI SDValue getLoadFFVP(EVT VT, const SDLoc &DL, SDValue Chain, + SDValue Ptr, SDValue Mask, SDValue EVL, + MachineMemOperand *MMO); LLVM_ABI SDValue getGetFPEnv(SDValue Chain, const SDLoc &dl, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO); diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index 8f88811be9c01..7fcf66c2c03f1 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -3099,6 +3099,23 @@ class MaskedHistogramSDNode : public MaskedGatherScatterSDNode { } }; +class VPLoadFFSDNode : public MemSDNode { +public: + friend class SelectionDAG; + + VPLoadFFSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs, EVT MemVT, + MachineMemOperand *MMO) + : MemSDNode(ISD::VP_LOAD_FF, Order, dl, VTs, MemVT, MMO) {} + + const SDValue &getBasePtr() const { return getOperand(1); } + const SDValue &getMask() const { return getOperand(2); } + const SDValue &getVectorLength() const { return getOperand(3); } + + static bool classof(const SDNode *N) { + return N->getOpcode() == ISD::VP_LOAD_FF; + } +}; + class FPStateAccessSDNode : public MemSDNode { public: friend class SelectionDAG; diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index bd6f94ac1286c..14529f173db94 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1932,6 +1932,14 @@ def int_vp_load : DefaultAttrsIntrinsic<[ llvm_anyvector_ty], llvm_i32_ty], [ NoCapture>, IntrReadMem, IntrArgMemOnly ]>; +def int_vp_load_ff + : DefaultAttrsIntrinsic<[llvm_anyvector_ty, llvm_i32_ty], + [llvm_anyptr_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty], + [NoCapture>, IntrNoSync, IntrReadMem, + IntrWillReturn, IntrArgMemOnly]>; + def int_vp_gather: DefaultAttrsIntrinsic<[ llvm_anyvector_ty], [ 
LLVMVectorOfAnyPointersToElt<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def index 55f4719da7c8b..4a71097226f18 100644 --- a/llvm/include/llvm/IR/VPIntrinsics.def +++ b/llvm/include/llvm/IR/VPIntrinsics.def @@ -587,6 +587,12 @@ VP_PROPERTY_FUNCTIONAL_OPC(Load) VP_PROPERTY_FUNCTIONAL_INTRINSIC(masked_load) END_REGISTER_VP(vp_load, VP_LOAD) +BEGIN_REGISTER_VP_INTRINSIC(vp_load_ff, 1, 2) +// val,chain = VP_LOAD_FF chain,base,mask,evl +BEGIN_REGISTER_VP_SDNODE(VP_LOAD_FF, -1, vp_load_ff, 2, 3) +HELPER_MAP_VPID_TO_VPSD(vp_load_ff, VP_LOAD_FF) +VP_PROPERTY_NO_FUNCTIONAL +END_REGISTER_VP(vp_load_ff, VP_LOAD_FF) // llvm.experimental.vp.strided.load(ptr,stride,mask,vlen) BEGIN_REGISTER_VP_INTRINSIC(experimental_vp_strided_load, 2, 3) // chain = EXPERIMENTAL_VP_STRIDED_LOAD chain,base,offset,stride,mask,evl diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h index cba37363d0474..8026242ac7fe6 100644 --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -453,6 +453,10 @@ class LoopVectorizationLegality { /// Returns a list of all known histogram operations in the loop. bool hasHistograms() const { return !Histograms.empty(); } + const SmallPtrSetImpl &getSpeculativeLoads() const { + return SpeculativeLoads; + } + PredicatedScalarEvolution *getPredicatedScalarEvolution() const { return &PSE; } @@ -645,6 +649,9 @@ class LoopVectorizationLegality { /// may work on the same memory location. SmallVector Histograms; + /// Hold all loads that need to be speculative. + SmallPtrSet SpeculativeLoads; + /// BFI and PSI are used to check for profile guided size optimizations. 
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 393f2648de3c9..5774a5c4105bd 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -862,3 +862,19 @@ bool llvm::isDereferenceableReadOnlyLoop(
   }
   return true;
 }
+
+bool llvm::isReadOnlyLoopWithSafeOrSpeculativeLoads(
+    Loop *L, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
+    SmallVectorImpl<LoadInst *> *SpeculativeLoads,
+    SmallVectorImpl<const SCEVPredicate *> *Predicates) {
+  for (BasicBlock *BB : L->blocks()) {
+    for (Instruction &I : *BB) {
+      if (auto *LI = dyn_cast<LoadInst>(&I)) {
+        if (!isDereferenceableAndAlignedInLoop(LI, L, *SE, *DT, AC, Predicates))
+          SpeculativeLoads->push_back(LI);
+      } else if (I.mayReadFromMemory() || I.mayWriteToMemory() || I.mayThrow())
+        return false;
+    }
+  }
+  return true;
+}
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 55ba52a1079ce..6482ba003bf38 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1457,6 +1457,10 @@ bool TargetTransformInfo::supportsScalableVectors() const {
   return TTIImpl->supportsScalableVectors();
 }
 
+bool TargetTransformInfo::supportsSpeculativeLoads() const {
+  return TTIImpl->supportsSpeculativeLoads();
+}
+
 bool TargetTransformInfo::enableScalableVectorization() const {
   return TTIImpl->enableScalableVectorization();
 }
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 2e13b1854bf29..63544e63e1da1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -971,6 +971,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_VP_LOAD_FF(VPLoadFFSDNode *LD, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *SLD, SDValue &Lo,
                                    SDValue &Hi);
   void SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, SDValue &Lo, SDValue &Hi);
@@ -1075,6 +1076,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N);
   SDValue WidenVecRes_LOAD(SDNode* N);
   SDValue WidenVecRes_VP_LOAD(VPLoadSDNode *N);
+  SDValue WidenVecRes_VP_LOAD_FF(VPLoadFFSDNode *N);
   SDValue WidenVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *N);
   SDValue WidenVecRes_VECTOR_COMPRESS(SDNode *N);
   SDValue WidenVecRes_MLOAD(MaskedLoadSDNode* N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 1661814d5a897..aadda5cd3ba7d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1152,6 +1152,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::VP_LOAD:
     SplitVecRes_VP_LOAD(cast<VPLoadSDNode>(N), Lo, Hi);
     break;
+  case ISD::VP_LOAD_FF:
+    SplitVecRes_VP_LOAD_FF(cast<VPLoadFFSDNode>(N), Lo, Hi);
+    break;
   case ISD::EXPERIMENTAL_VP_STRIDED_LOAD:
     SplitVecRes_VP_STRIDED_LOAD(cast<VPStridedLoadSDNode>(N), Lo, Hi);
     break;
@@ -2227,6 +2230,51 @@ void DAGTypeLegalizer::SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo,
   ReplaceValueWith(SDValue(LD, 1), Ch);
 }
 
+void DAGTypeLegalizer::SplitVecRes_VP_LOAD_FF(VPLoadFFSDNode *LD, SDValue &Lo,
+                                              SDValue &Hi) {
+  EVT LoVT, HiVT;
+  SDLoc dl(LD);
+  std::tie(LoVT, HiVT) =
DAG.GetSplitDestVTs(LD->getValueType(0)); + + SDValue Ch = LD->getChain(); + SDValue Ptr = LD->getBasePtr(); + Align Alignment = LD->getBaseAlign(); + SDValue Mask = LD->getMask(); + SDValue EVL = LD->getVectorLength(); + EVT MemoryVT = LD->getMemoryVT(); + + bool HiIsEmpty = false; + auto [LoMemVT, HiMemVT] = + DAG.GetDependentSplitDestVTs(MemoryVT, LoVT, &HiIsEmpty); + + // Split Mask operand + SDValue MaskLo, MaskHi; + if (Mask.getOpcode() == ISD::SETCC) { + SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi); + } else { + if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Mask, MaskLo, MaskHi); + else + std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); + } + + // Split EVL operand + auto [EVLLo, EVLHi] = DAG.SplitEVL(EVL, LD->getValueType(0), dl); + + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + LD->getPointerInfo(), MachineMemOperand::MOLoad, + LocationSize::beforeOrAfterPointer(), Alignment, LD->getAAInfo(), + LD->getRanges()); + + Lo = DAG.getLoadFFVP(LoVT, dl, Ch, Ptr, MaskLo, EVLLo, MMO); + + // Fill the upper half with poison. + Hi = DAG.getUNDEF(HiVT); + + ReplaceValueWith(SDValue(LD, 1), Lo.getValue(1)); + ReplaceValueWith(SDValue(LD, 2), Lo.getValue(2)); +} + void DAGTypeLegalizer::SplitVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *SLD, SDValue &Lo, SDValue &Hi) { assert(SLD->isUnindexed() && @@ -4707,6 +4755,9 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::VP_LOAD: Res = WidenVecRes_VP_LOAD(cast(N)); break; + case ISD::VP_LOAD_FF: + Res = WidenVecRes_VP_LOAD_FF(cast(N)); + break; case ISD::EXPERIMENTAL_VP_STRIDED_LOAD: Res = WidenVecRes_VP_STRIDED_LOAD(cast(N)); break; @@ -6163,6 +6214,29 @@ SDValue DAGTypeLegalizer::WidenVecRes_VP_LOAD(VPLoadSDNode *N) { return Res; } +SDValue DAGTypeLegalizer::WidenVecRes_VP_LOAD_FF(VPLoadFFSDNode *N) { + EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + SDValue Mask = N->getMask(); + SDValue EVL = N->getVectorLength(); + SDLoc dl(N); + + // The mask should be widened as well + assert(getTypeAction(Mask.getValueType()) == + TargetLowering::TypeWidenVector && + "Unable to widen binary VP op"); + Mask = GetWidenedVector(Mask); + assert(Mask.getValueType().getVectorElementCount() == + TLI.getTypeToTransformTo(*DAG.getContext(), Mask.getValueType()) + .getVectorElementCount() && + "Unable to widen vector load"); + + SDValue Res = DAG.getLoadFFVP(WidenVT, dl, N->getChain(), N->getBasePtr(), + Mask, EVL, N->getMemOperand()); + ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); + ReplaceValueWith(SDValue(N, 2), Res.getValue(2)); + return Res; +} + SDValue DAGTypeLegalizer::WidenVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *N) { SDLoc DL(N); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 5c586f73aa125..0cb9cbb7263c1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -837,6 +837,14 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) { ID.AddInteger(ELD->getMemOperand()->getFlags()); break; } + case ISD::VP_LOAD_FF: { + const auto *LD = cast(N); + ID.AddInteger(LD->getMemoryVT().getRawBits()); + ID.AddInteger(LD->getRawSubclassData()); + ID.AddInteger(LD->getPointerInfo().getAddrSpace()); + ID.AddInteger(LD->getMemOperand()->getFlags()); + break; + } case ISD::VP_STORE: { const VPStoreSDNode *EST = cast(N); ID.AddInteger(EST->getMemoryVT().getRawBits()); @@ -10393,6 
+10401,34 @@ SDValue SelectionDAG::getMaskedHistogram(SDVTList VTs, EVT MemVT, return V; } +SDValue SelectionDAG::getLoadFFVP(EVT VT, const SDLoc &dl, SDValue Chain, + SDValue Ptr, SDValue Mask, SDValue EVL, + MachineMemOperand *MMO) { + SDVTList VTs = getVTList(VT, EVL.getValueType(), MVT::Other); + SDValue Ops[] = {Chain, Ptr, Mask, EVL}; + FoldingSetNodeID ID; + AddNodeIDNode(ID, ISD::VP_LOAD_FF, VTs, Ops); + ID.AddInteger(VT.getRawBits()); + ID.AddInteger(getSyntheticNodeSubclassData(dl.getIROrder(), + VTs, VT, MMO)); + ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); + ID.AddInteger(MMO->getFlags()); + void *IP = nullptr; + if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { + cast(E)->refineAlignment(MMO); + return SDValue(E, 0); + } + auto *N = newSDNode(dl.getIROrder(), dl.getDebugLoc(), VTs, + VT, MMO); + createOperands(N, Ops); + + CSEMap.InsertNode(N, IP); + InsertNode(N); + SDValue V(N, 0); + NewSDValueDbgMsg(V, "Creating new node: ", this); + return V; +} + SDValue SelectionDAG::getGetFPEnv(SDValue Chain, const SDLoc &dl, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO) { assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 306e068f1c1da..719ce43ce4d02 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8440,6 +8440,35 @@ void SelectionDAGBuilder::visitVPLoad( setValue(&VPIntrin, LD); } +void SelectionDAGBuilder::visitVPLoadFF( + const VPIntrinsic &VPIntrin, EVT VT, EVT EVLVT, + const SmallVectorImpl &OpValues) { + assert(OpValues.size() == 3); + SDLoc DL = getCurSDLoc(); + Value *PtrOperand = VPIntrin.getArgOperand(0); + MaybeAlign Alignment = VPIntrin.getPointerAlignment(); + AAMDNodes AAInfo = VPIntrin.getAAMetadata(); + const MDNode *Ranges = VPIntrin.getMetadata(LLVMContext::MD_range); + SDValue LD; + bool AddToChain = true; + // Do not serialize variable-length loads of constant memory with + // anything. + if (!Alignment) + Alignment = DAG.getEVTAlign(VT); + MemoryLocation ML = MemoryLocation::getAfter(PtrOperand, AAInfo); + AddToChain = !BatchAA || !BatchAA->pointsToConstantMemory(ML); + SDValue InChain = AddToChain ? 
DAG.getRoot() : DAG.getEntryNode(); + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + MachinePointerInfo(PtrOperand), MachineMemOperand::MOLoad, + LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo, Ranges); + LD = DAG.getLoadFFVP(VT, DL, InChain, OpValues[0], OpValues[1], OpValues[2], + MMO); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, EVLVT, LD.getValue(1)); + if (AddToChain) + PendingLoads.push_back(LD.getValue(2)); + setValue(&VPIntrin, DAG.getMergeValues({LD.getValue(0), Trunc}, DL)); +} + void SelectionDAGBuilder::visitVPGather( const VPIntrinsic &VPIntrin, EVT VT, const SmallVectorImpl &OpValues) { @@ -8673,6 +8702,9 @@ void SelectionDAGBuilder::visitVectorPredicationIntrinsic( case ISD::VP_LOAD: visitVPLoad(VPIntrin, ValueVTs[0], OpValues); break; + case ISD::VP_LOAD_FF: + visitVPLoadFF(VPIntrin, ValueVTs[0], ValueVTs[1], OpValues); + break; case ISD::VP_GATHER: visitVPGather(VPIntrin, ValueVTs[0], OpValues); break; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index 1c278076a219d..c251755ee7064 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -631,6 +631,8 @@ class SelectionDAGBuilder { void visitVectorExtractLastActive(const CallInst &I, unsigned Intrinsic); void visitVPLoad(const VPIntrinsic &VPIntrin, EVT VT, const SmallVectorImpl &OpValues); + void visitVPLoadFF(const VPIntrinsic &VPIntrin, EVT VT, EVT EVLVT, + const SmallVectorImpl &OpValues); void visitVPStore(const VPIntrinsic &VPIntrin, const SmallVectorImpl &OpValues); void visitVPGather(const VPIntrinsic &VPIntrin, EVT VT, diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp index b1d3339c5a414..23a4d1b5c615e 100644 --- a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -448,6 +448,7 @@ VPIntrinsic::getMemoryPointerParamPos(Intrinsic::ID VPID) { case Intrinsic::experimental_vp_strided_store: return 1; case Intrinsic::vp_load: + case Intrinsic::vp_load_ff: case Intrinsic::vp_gather: case Intrinsic::experimental_vp_strided_load: return 0; @@ -671,6 +672,10 @@ Function *VPIntrinsic::getOrInsertDeclarationForParams( VPFunc = Intrinsic::getOrInsertDeclaration( M, VPID, {ReturnType, Params[0]->getType()}); break; + case Intrinsic::vp_load_ff: + VPFunc = Intrinsic::getOrInsertDeclaration( + M, VPID, {ReturnType->getStructElementType(0), Params[0]->getType()}); + break; case Intrinsic::experimental_vp_strided_load: VPFunc = Intrinsic::getOrInsertDeclaration( M, VPID, {ReturnType, Params[0]->getType(), Params[1]->getType()}); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index b47d89b42f533..30a50da5d0c5d 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -927,6 +927,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, {ISD::VP_LOAD, ISD::VP_STORE, ISD::EXPERIMENTAL_VP_STRIDED_LOAD, ISD::EXPERIMENTAL_VP_STRIDED_STORE, ISD::VP_GATHER, ISD::VP_SCATTER}, VT, Custom); + setOperationAction(ISD::VP_LOAD_FF, VT, Custom); setOperationAction({ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR, ISD::EXTRACT_SUBVECTOR, ISD::SCALAR_TO_VECTOR}, @@ -1105,6 +1106,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, {ISD::VP_LOAD, ISD::VP_STORE, ISD::EXPERIMENTAL_VP_STRIDED_LOAD, ISD::EXPERIMENTAL_VP_STRIDED_STORE, ISD::VP_GATHER, ISD::VP_SCATTER}, VT, Custom); + 
setOperationAction(ISD::VP_LOAD_FF, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SELECT_CC, VT, Expand); @@ -1181,6 +1183,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::EXPERIMENTAL_VP_STRIDED_STORE, ISD::VP_GATHER, ISD::VP_SCATTER}, VT, Custom); + setOperationAction(ISD::VP_LOAD_FF, VT, Custom); setOperationAction(ISD::FNEG, VT, Expand); setOperationAction(ISD::FABS, VT, Expand); @@ -1352,6 +1355,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::EXPERIMENTAL_VP_STRIDED_STORE, ISD::VP_GATHER, ISD::VP_SCATTER}, VT, Custom); + setOperationAction(ISD::VP_LOAD_FF, VT, Custom); setOperationAction({ISD::ADD, ISD::MUL, ISD::SUB, ISD::AND, ISD::OR, ISD::XOR, ISD::SDIV, ISD::SREM, ISD::UDIV, @@ -1442,6 +1446,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::VP_SCATTER, ISD::EXPERIMENTAL_VP_STRIDED_LOAD, ISD::EXPERIMENTAL_VP_STRIDED_STORE}, VT, Custom); + setOperationAction(ISD::VP_LOAD_FF, VT, Custom); setOperationAction({ISD::FP_ROUND, ISD::FP_EXTEND}, VT, Custom); setOperationAction({ISD::STRICT_FP_ROUND, ISD::STRICT_FP_EXTEND}, VT, @@ -8096,6 +8101,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, case ISD::MLOAD: case ISD::VP_LOAD: return lowerMaskedLoad(Op, DAG); + case ISD::VP_LOAD_FF: + return lowerLoadFF(Op, DAG); case ISD::MSTORE: case ISD::VP_STORE: return lowerMaskedStore(Op, DAG); @@ -12682,6 +12689,57 @@ SDValue RISCVTargetLowering::lowerMaskedLoad(SDValue Op, return DAG.getMergeValues({Result, Chain}, DL); } +SDValue RISCVTargetLowering::lowerLoadFF(SDValue Op, SelectionDAG &DAG) const { + assert(Op.getResNo() == 0 && "Unexpected result number"); + SDLoc DL(Op); + MVT VT = Op.getSimpleValueType(); + + const auto *VPLoadFF = cast(Op); + EVT MemVT = VPLoadFF->getMemoryVT(); + MachineMemOperand *MMO = VPLoadFF->getMemOperand(); + SDValue Chain = VPLoadFF->getChain(); + SDValue BasePtr = VPLoadFF->getBasePtr(); + + SDValue Mask = VPLoadFF->getMask(); + SDValue VL = VPLoadFF->getVectorLength(); + + bool IsUnmasked = ISD::isConstantSplatVectorAllOnes(Mask.getNode()); + + MVT XLenVT = Subtarget.getXLenVT(); + + MVT ContainerVT = VT; + if (VT.isFixedLengthVector()) { + ContainerVT = getContainerForFixedLengthVector(VT); + if (!IsUnmasked) { + MVT MaskVT = getMaskTypeFor(ContainerVT); + Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget); + } + } + + unsigned IntID = + IsUnmasked ? 
Intrinsic::riscv_vleff : Intrinsic::riscv_vleff_mask; + SmallVector Ops{Chain, DAG.getTargetConstant(IntID, DL, XLenVT)}; + Ops.push_back(DAG.getUNDEF(ContainerVT)); + Ops.push_back(BasePtr); + if (!IsUnmasked) + Ops.push_back(Mask); + Ops.push_back(VL); + if (!IsUnmasked) + Ops.push_back(DAG.getTargetConstant(RISCVVType::TAIL_AGNOSTIC, DL, XLenVT)); + + SDVTList VTs = DAG.getVTList({ContainerVT, Op->getValueType(1), MVT::Other}); + + SDValue Result = + DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, MemVT, MMO); + SDValue OutVL = Result.getValue(1); + Chain = Result.getValue(2); + + if (VT.isFixedLengthVector()) + Result = convertFromScalableVector(VT, Result, DAG, Subtarget); + + return DAG.getMergeValues({Result, OutVL, Chain}, DL); +} + SDValue RISCVTargetLowering::lowerMaskedStore(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index ca70c46988b4e..a4c353478a649 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -526,6 +526,7 @@ class RISCVTargetLowering : public TargetLowering { SDValue lowerVECTOR_SPLICE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerABS(SDValue Op, SelectionDAG &DAG) const; SDValue lowerMaskedLoad(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerLoadFF(SDValue Op, SelectionDAG &DAG) const; SDValue lowerMaskedStore(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVectorCompress(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFixedLengthVectorFCOPYSIGNToRVV(SDValue Op, diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index d62d99cf31899..a6860fdcbde42 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -110,6 +110,9 @@ class RISCVTTIImpl final : public BasicTTIImplBase { bool supportsScalableVectors() const override { return ST->hasVInstructions(); } + bool supportsSpeculativeLoads() const override { + return ST->hasVInstructions(); + } bool enableOrderedReductions() const override { return true; } bool enableScalableVectorization() const override { return ST->hasVInstructions(); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 969d225c6ef2e..4382aff170043 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -23,6 +23,7 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Transforms/Utils/SizeOpts.h" @@ -1769,16 +1770,41 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() { assert(LatchBB->getUniquePredecessor() == SingleUncountableEdge->first && "Expected latch predecessor to be the early exiting block"); - // TODO: Handle loops that may fault. Predicates.clear(); - if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC, - &Predicates)) { + SmallVector NonDerefLoads; + bool HasSafeAccess = + TTI->supportsSpeculativeLoads() + ? 
isReadOnlyLoopWithSafeOrSpeculativeLoads( + TheLoop, PSE.getSE(), DT, AC, &NonDerefLoads, &Predicates) + : isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC, + &Predicates); + if (!HasSafeAccess) { reportVectorizationFailure( "Loop may fault", "Cannot vectorize potentially faulting early exit loop", "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop); return false; } + // Speculative loads need to be unit-stride. + for (LoadInst *LI : NonDerefLoads) { + if (LI->getParent() != TheLoop->getHeader()) { + reportVectorizationFailure("Cannot vectorize predicated speculative load", + "SpeculativeLoadNeedsPredication", ORE, + TheLoop); + return false; + } + int Stride = isConsecutivePtr(LI->getType(), LI->getPointerOperand()); + if (Stride != 1) { + reportVectorizationFailure("Loop contains non-unit-stride load", + "Cannot vectorize early exit loop with " + "speculative non-unit-stride load", + "SpeculativeNonUnitStrideLoadEarlyExitLoop", + ORE, TheLoop); + return false; + } + SpeculativeLoads.insert(LI); + LLVM_DEBUG(dbgs() << "LV: Found speculative load: " << *LI << "\n"); + } [[maybe_unused]] const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount(); @@ -1912,10 +1938,12 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { bool LoopVectorizationLegality::canFoldTailByMasking() const { // The only loops we can vectorize without a scalar epilogue, are loops with - // a bottom-test and a single exiting block. We'd have to handle the fact - // that not every instruction executes on the last iteration. This will - // require a lane mask which varies through the vector loop body. (TODO) - if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { + // a bottom-test and a single exiting block or those with early exits. We'd + // have to handle the fact that not every instruction executes on the last + // iteration. This will require a lane mask which varies through the vector + // loop body. (TODO) + if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() && + !hasUncountableEarlyExit()) { LLVM_DEBUG( dbgs() << "LV: Cannot fold tail by masking. Requires a singe latch exit\n"); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index fe93fcd28348a..50879b7ef14b3 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -402,6 +402,10 @@ static cl::opt EnableEarlyExitVectorization( cl::desc( "Enable vectorization of early exit loops with uncountable exits.")); +static cl::opt EnableSpeculativeLoads( + "enable-speculative-load", cl::init(false), cl::Hidden, + cl::desc("Enable vectorization of loops with speculative loads.")); + // Likelyhood of bypassing the vectorized loop because there are zero trips left // after prolog. See `emitIterationCountCheck`. static constexpr uint32_t MinItersBypassWeights[] = {1, 127}; @@ -1376,6 +1380,9 @@ class LoopVectorizationCostModel { if (ChosenTailFoldingStyle->first != TailFoldingStyle::DataWithEVL && ChosenTailFoldingStyle->second != TailFoldingStyle::DataWithEVL) return; + // Do not override EVL styles for speculative loads. + if (!Legal->getSpeculativeLoads().empty()) + return; // Override EVL styles if needed. // FIXME: Investigate opportunity for fixed vector factor. 
bool EVLIsLegal = UserIC <= 1 && IsScalableVF && @@ -4229,6 +4236,8 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, case VPDef::VPWidenPointerInductionSC: case VPDef::VPReductionPHISC: case VPDef::VPInterleaveSC: + case VPDef::VPWidenFFLoadEVLSC: + case VPDef::VPWidenFFLoadSC: case VPDef::VPWidenLoadEVLSC: case VPDef::VPWidenLoadSC: case VPDef::VPWidenStoreEVLSC: @@ -7756,6 +7765,12 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef Operands, Builder.insert(VectorPtr); Ptr = VectorPtr; } + if (Legal->getSpeculativeLoads().contains(I)) { + auto *Load = dyn_cast(I); + return new VPWidenFFLoadRecipe(*Load, Ptr, Mask, VPIRMetadata(*Load, LVer), + I->getDebugLoc()); + } + if (LoadInst *Load = dyn_cast(I)) return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, VPIRMetadata(*Load, LVer), I->getDebugLoc()); @@ -8677,10 +8692,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // count is >= increment and a multiple of the increment. bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None; if (!HasNUW) { - auto *IVInc = Plan->getVectorLoopRegion() - ->getExitingBasicBlock() - ->getTerminator() - ->getOperand(0); + auto *IVInc = Plan->getCanonicalIV()->getBackedgeValue(); assert(match(IVInc, m_VPInstruction( m_Specific(Plan->getCanonicalIV()), m_VPValue())) && "Did not find the canonical IV increment"); @@ -9985,6 +9997,26 @@ bool LoopVectorizePass::processLoop(Loop *L) { return false; } + if (EnableSpeculativeLoads) { + // VPWidenFFLoadEVLRecipe is currently the only concrete recipe that + // generates speculative load intrinsics. Since it relies on the EVL + // transform, speculative loads are only supported when tail-folding with + // EVL is enabled. + if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL || + PreferPredicateOverEpilogue != + PreferPredicateTy::PredicateOrDontVectorize) { + reportVectorizationFailure("Auto-vectorization of loops with speculative " + "load is not enabled", + "SpeculativeLoadsDisabled", ORE, L); + return false; + } + } else if (!LVL.getSpeculativeLoads().empty()) { + reportVectorizationFailure("Auto-vectorization of loops with speculative " + "load is not enabled", + "SpeculativeLoadsDisabled", ORE, L); + return false; + } + // Entrance to the VPlan-native vectorization path. Outer loops are processed // here. They may require CFG and instruction level transformations before // even evaluating whether vectorization is profitable. 
Since we cannot modify diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index a5de5933d5ff1..32745a72c67fb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -559,6 +559,8 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { case VPRecipeBase::VPBranchOnMaskSC: case VPRecipeBase::VPInterleaveSC: case VPRecipeBase::VPIRInstructionSC: + case VPRecipeBase::VPWidenFFLoadEVLSC: + case VPRecipeBase::VPWidenFFLoadSC: case VPRecipeBase::VPWidenLoadEVLSC: case VPRecipeBase::VPWidenLoadSC: case VPRecipeBase::VPWidenStoreEVLSC: @@ -2980,6 +2982,8 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase, static inline bool classof(const VPRecipeBase *R) { return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC || R->getVPDefID() == VPRecipeBase::VPWidenStoreSC || + R->getVPDefID() == VPRecipeBase::VPWidenFFLoadSC || + R->getVPDefID() == VPRecipeBase::VPWidenFFLoadEVLSC || R->getVPDefID() == VPRecipeBase::VPWidenLoadEVLSC || R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC; } @@ -3061,6 +3065,72 @@ struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe, } }; +struct VPWidenFFLoadRecipe final : public VPWidenMemoryRecipe, public VPValue { + VPWidenFFLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask, + const VPIRMetadata &Metadata, DebugLoc DL) + : VPWidenMemoryRecipe(VPDef::VPWidenFFLoadSC, Load, {Addr}, + /*Consecutive*/ true, /*Reverse*/ false, Metadata, + DL), + VPValue(this, &Load) { + setMask(Mask); + } + + VP_CLASSOF_IMPL(VPDef::VPWidenFFLoadSC); + + void execute(VPTransformState &State) override { + llvm_unreachable("cannot execute this recipe, should be replaced by " + "VPWidenFFLoadEVLRecipe"); + } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + /// Returns true if the recipe only uses the first lane of operand \p Op. + bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return Op == getAddr(); + } +}; + +struct VPWidenFFLoadEVLRecipe final : public VPWidenMemoryRecipe, + public VPValue { + VPWidenFFLoadEVLRecipe(VPWidenFFLoadRecipe &L, VPValue &EVL, VPValue *Mask) + : VPWidenMemoryRecipe(VPDef::VPWidenFFLoadEVLSC, L.getIngredient(), + {L.getAddr(), &EVL}, true, false, L, + L.getDebugLoc()), + VPValue(this, &getIngredient()) { + new VPValue(nullptr, this); // newVL + setMask(Mask); + } + + VP_CLASSOF_IMPL(VPDef::VPWidenFFLoadEVLSC); + + /// Return the EVL operand. + VPValue *getEVL() const { return getOperand(1); } + + /// Generate a wide load or gather. + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + /// Returns true if the recipe only uses the first lane of operand \p Op. + bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + // Widened, consecutive loads operations only demand the first lane of + // their address. + return Op == getEVL() || Op == getAddr(); + } +}; + /// A recipe for widening load operations with vector-predication intrinsics, /// using the address to load from, the explicit vector length and an optional /// mask. 
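Reviewer note (not part of the patch): the VPWidenFFLoadEVLRecipe defined above is expected to emit a ``llvm.vp.load.ff`` call and feed its second result back as the EVL for the rest of the loop body. A minimal hand-written IR sketch of that shape follows; the function name, value names, and the nxv4i32 element type are illustrative assumptions only, not output generated by this patch.

    define <vscale x 4 x i32> @ff_body_sketch(ptr %addr, i64 %avl, ptr %vl.out) {
    entry:
      ; Clamp the remaining trip count to a legal vector length for this step.
      %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %avl, i32 4, i1 true)
      ; Fault-only-first load: traps only if the first lane is unreadable.
      %ff = call { <vscale x 4 x i32>, i32 } @llvm.vp.load.ff.nxv4i32.p0(ptr align 4 %addr, <vscale x 4 x i1> splat (i1 true), i32 %evl)
      %vec = extractvalue { <vscale x 4 x i32>, i32 } %ff, 0
      %new.evl = extractvalue { <vscale x 4 x i32>, i32 } %ff, 1
      ; The number of lanes actually read (%new.evl <= %evl) replaces the EVL
      ; used by subsequent recipes and by the induction/EVL update.
      store i32 %new.evl, ptr %vl.out
      ret <vscale x 4 x i32> %vec
    }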
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 16072f268a98c..b23bc9c35698f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -186,7 +186,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) { } Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) { - assert((isa(R)) && + assert((isa(R)) && "Store recipes should not define any values"); return cast(&R->getIngredient())->getType(); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 68e7c20a070f4..616a42dbb184d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -84,6 +84,8 @@ bool VPRecipeBase::mayWriteToMemory() const { case VPWidenIntOrFpInductionSC: case VPWidenLoadEVLSC: case VPWidenLoadSC: + case VPWidenFFLoadEVLSC: + case VPWidenFFLoadSC: case VPWidenPHISC: case VPWidenSC: case VPWidenSelectSC: { @@ -107,6 +109,8 @@ bool VPRecipeBase::mayReadFromMemory() const { return cast(this)->opcodeMayReadOrWriteFromMemory(); case VPWidenLoadEVLSC: case VPWidenLoadSC: + case VPWidenFFLoadEVLSC: + case VPWidenFFLoadSC: return true; case VPReplicateSC: return cast(getVPSingleValue()->getUnderlyingValue()) @@ -184,6 +188,9 @@ bool VPRecipeBase::mayHaveSideEffects() const { "underlying instruction has side-effects"); return false; } + case VPWidenFFLoadEVLSC: + case VPWidenFFLoadSC: + return true; case VPInterleaveSC: return mayWriteToMemory(); case VPWidenLoadEVLSC: @@ -3155,6 +3162,55 @@ void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent, } #endif +void VPWidenFFLoadEVLRecipe::execute(VPTransformState &State) { + Type *ScalarDataTy = getLoadStoreType(&Ingredient); + auto *DataTy = VectorType::get(ScalarDataTy, State.VF); + const Align Alignment = getLoadStoreAlignment(&Ingredient); + + auto &Builder = State.Builder; + State.setDebugLocFrom(getDebugLoc()); + + Value *EVL = State.get(getEVL(), VPLane(0)); + Value *Addr = State.get(getAddr(), true); + Value *Mask = nullptr; + if (VPValue *VPMask = getMask()) + Mask = State.get(VPMask); + else + Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); + CallInst *NewLI = + Builder.CreateIntrinsic(Intrinsic::vp_load_ff, {DataTy, Addr->getType()}, + {Addr, Mask, EVL}, nullptr, "vp.op.load.ff"); + NewLI->addParamAttr( + 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment)); + applyMetadata(*NewLI); + Value *V = cast(Builder.CreateExtractValue(NewLI, 0)); + Value *VL = Builder.CreateExtractValue(NewLI, 1); + State.set(getVPValue(0), V); + State.set(getVPValue(1), VL, /*NeedsScalar=*/true); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPWidenFFLoadRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "WIDEN "; + printAsOperand(O, SlotTracker); + O << " = fault-only-first-load "; + printOperands(O, SlotTracker); +} +#endif + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPWidenFFLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "WIDEN "; + printAsOperand(O, SlotTracker); + O << ", "; + getVPValue(1)->printAsOperand(O, SlotTracker); + O << " = vp.load.ff "; + printOperands(O, SlotTracker); +} +#endif + /// Use all-true mask for reverse rather than actual mask, as it avoids a /// dependence w/o affecting the result. 
static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand, diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 47a9ff09352cb..32bb136446bf4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2047,6 +2047,16 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( // Replace the original terminator with BranchOnCond. We have to invert the // mask here because a true condition means jumping to the exit block. auto *NotMask = Builder.createNot(ALM, DL); + using namespace VPlanPatternMatch; + if (VPValue *IsEarlyExitTaken = nullptr; match( + OriginalTerminator, m_BranchOnCond(m_BinaryOr( + m_VPValue(IsEarlyExitTaken), m_VPValue())))) { + auto *AnyExitTaken = + Builder.createNaryOp(Instruction::Or, {IsEarlyExitTaken, NotMask}); + OriginalTerminator->setOperand(0, AnyExitTaken); + return LaneMaskPhi; + } + Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL); OriginalTerminator->eraseFromParent(); return LaneMaskPhi; @@ -2139,8 +2149,7 @@ void VPlanTransforms::addActiveLaneMask( /// \p AllOneMask The vector mask parameter of vector-predication intrinsics. /// \p EVL The explicit vector length parameter of vector-predication /// intrinsics. -static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask, - VPRecipeBase &CurRecipe, +static VPRecipeBase *optimizeMaskToEVL(VPlan &Plan, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &AllOneMask, VPValue &EVL) { using namespace llvm::VPlanPatternMatch; @@ -2148,9 +2157,13 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask, assert(OrigMask && "Unmasked recipe when folding tail"); // HeaderMask will be handled using EVL. VPValue *Mask; - if (match(OrigMask, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask)))) + VPValue *HeaderMask; + if (match(OrigMask, m_LogicalAnd(m_VPValue(HeaderMask), m_VPValue(Mask))) && + vputils::isHeaderMask(HeaderMask, Plan)) return Mask; - return HeaderMask == OrigMask ? nullptr : OrigMask; + if (vputils::isHeaderMask(OrigMask, Plan)) + return nullptr; + return OrigMask; }; return TypeSwitch(&CurRecipe) @@ -2158,6 +2171,10 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask, VPValue *NewMask = GetNewMask(L->getMask()); return new VPWidenLoadEVLRecipe(*L, EVL, NewMask); }) + .Case([&](VPWidenFFLoadRecipe *L) { + VPValue *NewMask = GetNewMask(L->getMask()); + return new VPWidenFFLoadEVLRecipe(*L, EVL, NewMask); + }) .Case([&](VPWidenStoreRecipe *S) { VPValue *NewMask = GetNewMask(S->getMask()); return new VPWidenStoreEVLRecipe(*S, EVL, NewMask); @@ -2172,8 +2189,10 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask, // select(header_mask, LHS, RHS) // into vector predication merge. // vp.merge(all-true, LHS, RHS, EVL) - if (!match(VPI, m_Select(m_Specific(HeaderMask), m_VPValue(LHS), - m_VPValue(RHS)))) + VPValue *HeaderMask; + if (!match(VPI, m_Select(m_VPValue(HeaderMask), m_VPValue(LHS), + m_VPValue(RHS))) || + !vputils::isHeaderMask(HeaderMask, Plan)) return nullptr; // Use all true as the condition because this transformation is // limited to selects whose condition is a header mask. @@ -2185,7 +2204,7 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask, } /// Replace recipes with their EVL variants. 
-static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { +static VPValue *transformRecipestoEVLRecipes(VPlan &Plan) { Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType(); VPTypeAnalysis TypeInfo(CanonicalIVType); LLVMContext &Ctx = CanonicalIVType->getContext(); @@ -2197,7 +2216,6 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { IsaPred) && "User of VF that we can't transform to EVL."); - Plan.getVF().replaceAllUsesWith(&EVL); // Defer erasing recipes till the end so that we don't invalidate the // VPTypeAnalysis cache. @@ -2205,6 +2223,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { // Create a scalar phi to track the previous EVL if fixed-order recurrence is // contained. + VPInstruction *PrevEVL = nullptr; bool ContainsFORs = any_of(Header->phis(), IsaPred); if (ContainsFORs) { @@ -2217,79 +2236,97 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { DebugLoc()); Builder.setInsertPoint(Header, Header->getFirstNonPhi()); - VPValue *PrevEVL = - Builder.createScalarPhi({MaxEVL, &EVL}, DebugLoc(), "prev.evl"); - - for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( - vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()))) { - for (VPRecipeBase &R : *VPBB) { - using namespace VPlanPatternMatch; - VPValue *V1, *V2; - if (!match(&R, - m_VPInstruction( - m_VPValue(V1), m_VPValue(V2)))) - continue; + PrevEVL = Builder.createScalarPhi({MaxEVL}, DebugLoc(), "prev.evl"); + } + + ReversePostOrderTraversal> RPOT( + Plan.getEntry()); + VPValue *LastEVL = nullptr; + VPValue *VF = &Plan.getVF(); + + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(RPOT)) { + for (VPRecipeBase &CurRecipe : *VPBB) { + auto *VPI = dyn_cast(&CurRecipe); + if (VPI && (VPI->getOpcode() == VPInstruction::ExplicitVectorLength)) { + assert((LastEVL == nullptr) && "EVL should be set only once"); + LastEVL = VPI; + continue; + } + if (!LastEVL) + continue; + if (isa(&CurRecipe)) { + if (CurRecipe.getOperand(1) == VF) + CurRecipe.setOperand(1, LastEVL); + continue; + } + if (isa(&CurRecipe)) { + if (CurRecipe.getOperand(2) == VF) + CurRecipe.setOperand(2, LastEVL); + continue; + } + VPValue *V1, *V2; + using namespace VPlanPatternMatch; + if (match(&CurRecipe, + m_VPInstruction( + m_VPValue(V1), m_VPValue(V2)))) { VPValue *Imm = Plan.getOrAddLiveIn( ConstantInt::getSigned(Type::getInt32Ty(Ctx), -1)); VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe( Intrinsic::experimental_vp_splice, - {V1, V2, Imm, AllOneMask, PrevEVL, &EVL}, - TypeInfo.inferScalarType(R.getVPSingleValue()), R.getDebugLoc()); - VPSplice->insertBefore(&R); - R.getVPSingleValue()->replaceAllUsesWith(VPSplice); - ToErase.push_back(&R); + {V1, V2, Imm, AllOneMask, PrevEVL, LastEVL}, + TypeInfo.inferScalarType(CurRecipe.getVPSingleValue()), + CurRecipe.getDebugLoc()); + VPSplice->insertBefore(&CurRecipe); + CurRecipe.getVPSingleValue()->replaceAllUsesWith(VPSplice); + ToErase.push_back(&CurRecipe); + continue; } - } - } - - // Try to optimize header mask recipes away to their EVL variants. - for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) { - // TODO: Split optimizeMaskToEVL out and move into - // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in - // tryToBuildVPlanWithVPRecipes beforehand. - for (VPUser *U : collectUsersRecursively(HeaderMask)) { - auto *CurRecipe = cast(U); + // TODO: Split optimizeMaskToEVL out and move into + // VPlanTransforms::optimize. 
transformRecipestoEVLRecipes should be run + // in tryToBuildVPlanWithVPRecipes beforehand. VPRecipeBase *EVLRecipe = - optimizeMaskToEVL(HeaderMask, *CurRecipe, TypeInfo, *AllOneMask, EVL); + optimizeMaskToEVL(Plan, CurRecipe, TypeInfo, *AllOneMask, *LastEVL); if (!EVLRecipe) continue; [[maybe_unused]] unsigned NumDefVal = EVLRecipe->getNumDefinedValues(); - assert(NumDefVal == CurRecipe->getNumDefinedValues() && - "New recipe must define the same number of values as the " - "original."); - assert( - NumDefVal <= 1 && - "Only supports recipes with a single definition or without users."); - EVLRecipe->insertBefore(CurRecipe); + // Check if the recipe updates EVL + if (isa(EVLRecipe)) { + VPValue *CurVPV = CurRecipe.getVPSingleValue(); + CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(0)); + LastEVL = EVLRecipe->getVPValue(1); + } else { + assert(NumDefVal == CurRecipe.getNumDefinedValues() && + "New recipe must define the same number of values as the " + "original."); + assert( + NumDefVal <= 1 && + "Only supports recipes with a single definition or without users."); + } + EVLRecipe->insertBefore(&CurRecipe); if (isa(EVLRecipe)) { - VPValue *CurVPV = CurRecipe->getVPSingleValue(); + VPValue *CurVPV = CurRecipe.getVPSingleValue(); CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue()); } - ToErase.push_back(CurRecipe); + ToErase.push_back(&CurRecipe); } - - // Replace header masks with a mask equivalent to predicating by EVL: - // - // icmp ule widen-canonical-iv backedge-taken-count - // -> - // icmp ult step-vector, EVL - VPRecipeBase *EVLR = EVL.getDefiningRecipe(); - VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator())); - Type *EVLType = TypeInfo.inferScalarType(&EVL); - VPValue *EVLMask = Builder.createICmp( - CmpInst::ICMP_ULT, - Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL); - HeaderMask->replaceAllUsesWith(EVLMask); - ToErase.push_back(HeaderMask->getDefiningRecipe()); } - + for (VPRecipeBase &CurRecipe : Header->phis()) { + if (isa(&CurRecipe)) { + if (CurRecipe.getOperand(2) == VF) + CurRecipe.setOperand(2, LastEVL); + continue; + } + } + if (PrevEVL) + PrevEVL->addOperand(LastEVL); for (VPRecipeBase *R : reverse(ToErase)) { SmallVector PossiblyDead(R->operands()); R->eraseFromParent(); for (VPValue *Op : PossiblyDead) recursivelyDeleteDeadRecipes(Op); } + return LastEVL; } /// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and @@ -2360,13 +2397,13 @@ bool VPlanTransforms::tryAddExplicitVectorLength( VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe); AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc(), "safe_avl"); } - auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL, - DebugLoc()); + Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL, DebugLoc()); + + VPValue *OpVPEVL = transformRecipestoEVLRecipes(Plan); auto *CanonicalIVIncrement = cast(CanonicalIVPHI->getBackedgeValue()); Builder.setInsertPoint(CanonicalIVIncrement); - VPValue *OpVPEVL = VPEVL; auto *I32Ty = Type::getInt32Ty(CanIVTy->getContext()); OpVPEVL = Builder.createScalarZExtOrTrunc( @@ -2379,8 +2416,6 @@ bool VPlanTransforms::tryAddExplicitVectorLength( CanonicalIVIncrement->getDebugLoc(), "index.evl.next"); EVLPhi->addOperand(NextEVLIV); - transformRecipestoEVLRecipes(Plan, *VPEVL); - // Replace all uses of VPCanonicalIVPHIRecipe by // VPEVLBasedIVPHIRecipe except for the canonical IV increment. 
CanonicalIVPHI->replaceAllUsesWith(EVLPhi); @@ -2442,6 +2477,21 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) { // Skip single-iteration loop region if (match(LatchExitingBr, m_BranchOnCond(m_True()))) return; + + // Replace VectorTripCount used in loop with early-exits + if (VPValue *VPMainExitCond = nullptr; + match(LatchExitingBr, m_BranchOnCond(m_BinaryOr( + m_VPValue(), m_VPValue(VPMainExitCond)))) && + match(VPMainExitCond, m_VPInstruction( + m_Specific(EVLIncrement), + m_Specific(&Plan.getVectorTripCount())))) { + // Expected pattern here is: + // EMIT vp<%main.exit.cond> = icmp eq vp<%evl.next>, vp<%vtc> + // EMIT vp<%exit.cond> = or vp<%alt.exit.cond>, vp<%main.exit.cond> + // EMIT branch-on-cond vp<%exit.cond> + VPMainExitCond->getDefiningRecipe()->setOperand(1, Plan.getTripCount()); + return; + } assert(LatchExitingBr && match(LatchExitingBr, m_BranchOnCount(m_VPValue(EVLIncrement), diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 24f6d61512ef6..179e557731ce9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -40,6 +40,7 @@ class VPUser; class VPRecipeBase; class VPInterleaveRecipe; class VPPhiAccessors; +class VPWidenFFLoadEVLRecipe; // This is the base class of the VPlan Def/Use graph, used for modeling the data // flow into, within and out of the VPlan. VPValues can stand for live-ins @@ -51,6 +52,7 @@ class LLVM_ABI_FOR_TEST VPValue { friend class VPInterleaveRecipe; friend class VPlan; friend class VPExpressionRecipe; + friend class VPWidenFFLoadEVLRecipe; const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast). @@ -348,6 +350,8 @@ class VPDef { VPWidenCastSC, VPWidenGEPSC, VPWidenIntrinsicSC, + VPWidenFFLoadSC, + VPWidenFFLoadEVLSC, VPWidenLoadEVLSC, VPWidenLoadSC, VPWidenStoreEVLSC, diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 57d01cbefbe26..252754724caa2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -167,7 +167,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { } return VerifyEVLUse(*R, 2); }) - .Case( + .Case( [&](const VPRecipeBase *R) { return VerifyEVLUse(*R, 1); }) .Case( [&](const VPInstructionWithType *S) { return VerifyEVLUse(*S, 0); }) @@ -175,6 +176,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { if (I->getOpcode() == Instruction::PHI || I->getOpcode() == Instruction::ICmp) return VerifyEVLUse(*I, 1); + if (I->getOpcode() == Instruction::Sub) + return VerifyEVLUse(*I, 0); switch (I->getOpcode()) { case Instruction::Add: break; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vploadff.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vploadff.ll new file mode 100644 index 0000000000000..5b01976dbbebd --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vploadff.ll @@ -0,0 +1,586 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v \ +; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zvfbfmin,+v \ +; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+zvfbfmin,+v \ +; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+zvfbfmin,+v \ +; RUN: -verify-machineinstrs < %s | FileCheck %s + +define { <2 x 
i8>, i32 } @vploadff_v2i8(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <2 x i8>, i32 } @llvm.vp.load.ff.v2i8.p0(ptr %ptr, <2 x i1> %m, i32 %evl) + ret { <2 x i8>, i32 } %load +} + +define { <2 x i8>, i32 } @vploadff_v2i8_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v2i8_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <2 x i8>, i32 } @llvm.vp.load.ff.v2i8.p0(ptr %ptr, <2 x i1> splat (i1 true), i32 %evl) + ret { <2 x i8>, i32 } %load +} + +define { <4 x i8>, i32 } @vploadff_v4i8(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <4 x i8>, i32 } @llvm.vp.load.ff.v4i8.p0(ptr %ptr, <4 x i1> %m, i32 %evl) + ret { <4 x i8>, i32 } %load +} + +define { <4 x i8>, i32 } @vploadff_v4i8_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v4i8_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <4 x i8>, i32 } @llvm.vp.load.ff.v4i8.p0(ptr %ptr, <4 x i1> splat (i1 true), i32 %evl) + ret { <4 x i8>, i32 } %load +} + +define { <8 x i8>, i32 } @vploadff_v8i8(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <8 x i8>, i32 } @llvm.vp.load.ff.v8i8.p0(ptr %ptr, <8 x i1> %m, i32 %evl) + ret { <8 x i8>, i32 } %load +} + +define { <8 x i8>, i32 } @vploadff_v8i8_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v8i8_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <8 x i8>, i32 } @llvm.vp.load.ff.v8i8.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %evl) + ret { <8 x i8>, i32 } %load +} + +define { <2 x i16>, i32 } @vploadff_v2i16(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <2 x i16>, i32 } @llvm.vp.load.ff.v2i16.p0(ptr %ptr, <2 x i1> %m, i32 %evl) + ret { <2 x i16>, i32 } %load +} + +define { <2 x i16>, i32 } @vploadff_v2i16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v2i16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <2 x i16>, i32 } @llvm.vp.load.ff.v2i16.p0(ptr %ptr, <2 x i1> splat (i1 true), i32 %evl) + ret { <2 x i16>, i32 } %load +} + +define { <4 x i16>, i32 } @vploadff_v4i16(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <4 x i16>, i32 } 
@llvm.vp.load.ff.v4i16.p0(ptr %ptr, <4 x i1> %m, i32 %evl) + ret { <4 x i16>, i32 } %load +} + +define { <4 x i16>, i32 } @vploadff_v4i16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v4i16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <4 x i16>, i32 } @llvm.vp.load.ff.v4i16.p0(ptr %ptr, <4 x i1> splat (i1 true), i32 %evl) + ret { <4 x i16>, i32 } %load +} + +define { <8 x i16>, i32 } @vploadff_v8i16(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <8 x i16>, i32 } @llvm.vp.load.ff.v8i16.p0(ptr %ptr, <8 x i1> %m, i32 %evl) + ret { <8 x i16>, i32 } %load +} + +define { <8 x i16>, i32 } @vploadff_v8i16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v8i16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <8 x i16>, i32 } @llvm.vp.load.ff.v8i16.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %evl) + ret { <8 x i16>, i32 } %load +} + +define { <2 x i32>, i32 } @vploadff_v2i32(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <2 x i32>, i32 } @llvm.vp.load.ff.v2i32.p0(ptr %ptr, <2 x i1> %m, i32 %evl) + ret { <2 x i32>, i32 } %load +} + +define { <2 x i32>, i32 } @vploadff_v2i32_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v2i32_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <2 x i32>, i32 } @llvm.vp.load.ff.v2i32.p0(ptr %ptr, <2 x i1> splat (i1 true), i32 %evl) + ret { <2 x i32>, i32 } %load +} + +define { <4 x i32>, i32 } @vploadff_v4i32(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <4 x i32>, i32 } @llvm.vp.load.ff.v4i32.p0(ptr %ptr, <4 x i1> %m, i32 %evl) + ret { <4 x i32>, i32 } %load +} + +define { <4 x i32>, i32 } @vploadff_v4i32_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v4i32_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <4 x i32>, i32 } @llvm.vp.load.ff.v4i32.p0(ptr %ptr, <4 x i1> splat (i1 true), i32 %evl) + ret { <4 x i32>, i32 } %load +} + +define { <8 x i32>, i32 } @vploadff_v8i32(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <8 x i32>, i32 } @llvm.vp.load.ff.v8i32.p0(ptr %ptr, <8 x i1> %m, i32 %evl) + ret { <8 x i32>, i32 } %load +} + +define { <8 x i32>, i32 } @vploadff_v8i32_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v8i32_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: 
vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <8 x i32>, i32 } @llvm.vp.load.ff.v8i32.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %evl) + ret { <8 x i32>, i32 } %load +} + +define { <2 x i64>, i32 } @vploadff_v2i64(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <2 x i64>, i32 } @llvm.vp.load.ff.v2i64.p0(ptr %ptr, <2 x i1> %m, i32 %evl) + ret { <2 x i64>, i32 } %load +} + +define { <2 x i64>, i32 } @vploadff_v2i64_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v2i64_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <2 x i64>, i32 } @llvm.vp.load.ff.v2i64.p0(ptr %ptr, <2 x i1> splat (i1 true), i32 %evl) + ret { <2 x i64>, i32 } %load +} + +define { <4 x i64>, i32 } @vploadff_v4i64(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <4 x i64>, i32 } @llvm.vp.load.ff.v4i64.p0(ptr %ptr, <4 x i1> %m, i32 %evl) + ret { <4 x i64>, i32 } %load +} + +define { <4 x i64>, i32 } @vploadff_v4i64_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v4i64_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <4 x i64>, i32 } @llvm.vp.load.ff.v4i64.p0(ptr %ptr, <4 x i1> splat (i1 true), i32 %evl) + ret { <4 x i64>, i32 } %load +} + +define { <8 x i64>, i32 } @vploadff_v8i64(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <8 x i64>, i32 } @llvm.vp.load.ff.v8i64.p0(ptr %ptr, <8 x i1> %m, i32 %evl) + ret { <8 x i64>, i32 } %load +} + +define { <8 x i64>, i32 } @vploadff_v8i64_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v8i64_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <8 x i64>, i32 } @llvm.vp.load.ff.v8i64.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %evl) + ret { <8 x i64>, i32 } %load +} + +define { <32 x i64>, i32 } @vploadff_v32i64(ptr %ptr, <32 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v32i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a3, 16 +; CHECK-NEXT: bltu a2, a3, .LBB24_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: .LBB24_2: +; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a1), v0.t +; CHECK-NEXT: csrr a1, vl +; CHECK-NEXT: sw a1, 256(a0) +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: ret + %load = call { <32 x i64>, i32 } @llvm.vp.load.ff.v32i64.p0(ptr %ptr, <32 x i1> %m, i32 %evl) + ret { <32 x i64>, i32 } %load +} + +define { <32 x i64>, i32 } @vploadff_v32i64_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v32i64_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: li a3, 
16 +; CHECK-NEXT: bltu a2, a3, .LBB25_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: .LBB25_2: +; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a1) +; CHECK-NEXT: csrr a1, vl +; CHECK-NEXT: sw a1, 256(a0) +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: ret + %load = call { <32 x i64>, i32 } @llvm.vp.load.ff.v32i64.p0(ptr %ptr, <32 x i1> splat (i1 true), i32 %evl) + ret { <32 x i64>, i32 } %load +} + +define { <2 x half>, i32 } @vploadff_v2f16(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <2 x half>, i32 } @llvm.vp.load.ff.v2f16.p0(ptr %ptr, <2 x i1> %m, i32 %evl) + ret { <2 x half>, i32 } %load +} + +define { <2 x half>, i32 } @vploadff_v2f16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v2f16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <2 x half>, i32 } @llvm.vp.load.ff.v2f16.p0(ptr %ptr, <2 x i1> splat (i1 true), i32 %evl) + ret { <2 x half>, i32 } %load +} + +define { <4 x half>, i32 } @vploadff_v4f16(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <4 x half>, i32 } @llvm.vp.load.ff.v4f16.p0(ptr %ptr, <4 x i1> %m, i32 %evl) + ret { <4 x half>, i32 } %load +} + +define { <4 x half>, i32 } @vploadff_v4f16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v4f16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <4 x half>, i32 } @llvm.vp.load.ff.v4f16.p0(ptr %ptr, <4 x i1> splat (i1 true), i32 %evl) + ret { <4 x half>, i32 } %load +} + +define { <8 x half>, i32 } @vploadff_v8f16(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <8 x half>, i32 } @llvm.vp.load.ff.v8f16.p0(ptr %ptr, <8 x i1> %m, i32 %evl) + ret { <8 x half>, i32 } %load +} + +define { <8 x half>, i32 } @vploadff_v8f16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v8f16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <8 x half>, i32 } @llvm.vp.load.ff.v8f16.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %evl) + ret { <8 x half>, i32 } %load +} + +define { <2 x float>, i32 } @vploadff_v2f32(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <2 x float>, i32 } @llvm.vp.load.ff.v2f32.p0(ptr %ptr, <2 x i1> %m, i32 %evl) + ret { <2 x float>, i32 } %load +} + +define { <2 x float>, i32 } @vploadff_v2f32_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v2f32_allones_mask: +; CHECK: # %bb.0: +; 
CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <2 x float>, i32 } @llvm.vp.load.ff.v2f32.p0(ptr %ptr, <2 x i1> splat (i1 true), i32 %evl) + ret { <2 x float>, i32 } %load +} + +define { <4 x float>, i32 } @vploadff_v4f32(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <4 x float>, i32 } @llvm.vp.load.ff.v4f32.p0(ptr %ptr, <4 x i1> %m, i32 %evl) + ret { <4 x float>, i32 } %load +} + +define { <4 x float>, i32 } @vploadff_v4f32_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v4f32_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <4 x float>, i32 } @llvm.vp.load.ff.v4f32.p0(ptr %ptr, <4 x i1> splat (i1 true), i32 %evl) + ret { <4 x float>, i32 } %load +} + +define { <8 x float>, i32 } @vploadff_v8f32(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <8 x float>, i32 } @llvm.vp.load.ff.v8f32.p0(ptr %ptr, <8 x i1> %m, i32 %evl) + ret { <8 x float>, i32 } %load +} + +define { <8 x float>, i32 } @vploadff_v8f32_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v8f32_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <8 x float>, i32 } @llvm.vp.load.ff.v8f32.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %evl) + ret { <8 x float>, i32 } %load +} + +define { <2 x double>, i32 } @vploadff_v2f64(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <2 x double>, i32 } @llvm.vp.load.ff.v2f64.p0(ptr %ptr, <2 x i1> %m, i32 %evl) + ret { <2 x double>, i32 } %load +} + +define { <2 x double>, i32 } @vploadff_v2f64_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v2f64_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <2 x double>, i32 } @llvm.vp.load.ff.v2f64.p0(ptr %ptr, <2 x i1> splat (i1 true), i32 %evl) + ret { <2 x double>, i32 } %load +} + +define { <4 x double>, i32 } @vploadff_v4f64(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <4 x double>, i32 } @llvm.vp.load.ff.v4f64.p0(ptr %ptr, <4 x i1> %m, i32 %evl) + ret { <4 x double>, i32 } %load +} + +define { <4 x double>, i32 } @vploadff_v4f64_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v4f64_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <4 x double>, i32 } @llvm.vp.load.ff.v4f64.p0(ptr %ptr, <4 x i1> 
splat (i1 true), i32 %evl) + ret { <4 x double>, i32 } %load +} + +define { <8 x double>, i32 } @vploadff_v8f64(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v8f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <8 x double>, i32 } @llvm.vp.load.ff.v8f64.p0(ptr %ptr, <8 x i1> %m, i32 %evl) + ret { <8 x double>, i32 } %load +} + +define { <8 x double>, i32 } @vploadff_v8f64_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v8f64_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <8 x double>, i32 } @llvm.vp.load.ff.v8f64.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %evl) + ret { <8 x double>, i32 } %load +} + +define { <2 x bfloat>, i32 } @vploadff_v2bf16(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <2 x bfloat>, i32 } @llvm.vp.load.ff.v2bf16.p0(ptr %ptr, <2 x i1> %m, i32 %evl) + ret { <2 x bfloat>, i32 } %load +} + +define { <2 x bfloat>, i32 } @vploadff_v2bf16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v2bf16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <2 x bfloat>, i32 } @llvm.vp.load.ff.v2bf16.p0(ptr %ptr, <2 x i1> splat (i1 true), i32 %evl) + ret { <2 x bfloat>, i32 } %load +} + +define { <4 x bfloat>, i32 } @vploadff_v4bf16(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <4 x bfloat>, i32 } @llvm.vp.load.ff.v4bf16.p0(ptr %ptr, <4 x i1> %m, i32 %evl) + ret { <4 x bfloat>, i32 } %load +} + +define { <4 x bfloat>, i32 } @vploadff_v4bf16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v4bf16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <4 x bfloat>, i32 } @llvm.vp.load.ff.v4bf16.p0(ptr %ptr, <4 x i1> splat (i1 true), i32 %evl) + ret { <4 x bfloat>, i32 } %load +} + +define { <8 x bfloat>, i32 } @vploadff_v8bf16(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <8 x bfloat>, i32 } @llvm.vp.load.ff.v8bf16.p0(ptr %ptr, <8 x i1> %m, i32 %evl) + ret { <8 x bfloat>, i32 } %load +} + +define { <8 x bfloat>, i32 } @vploadff_v8bf16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v8bf16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <8 x bfloat>, i32 } @llvm.vp.load.ff.v8bf16.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %evl) + ret { <8 x bfloat>, i32 } %load +} + +define { <7 x i8>, i32 } @vploadff_v7i8(ptr %ptr, <7 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: 
vploadff_v7i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <7 x i8>, i32 } @llvm.vp.load.ff.v7i8.p0(ptr %ptr, <7 x i1> %m, i32 %evl) + ret { <7 x i8>, i32 } %load +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vploadff.ll b/llvm/test/CodeGen/RISCV/rvv/vploadff.ll new file mode 100644 index 0000000000000..9e08938a9fe6c --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vploadff.ll @@ -0,0 +1,1008 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v \ +; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zvfbfmin,+v \ +; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+zvfbfmin,+v \ +; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+zvfbfmin,+v \ +; RUN: -verify-machineinstrs < %s | FileCheck %s + +define { , i32 } @vploadff_nxv1i8(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv1i8.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv1i8_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv1i8_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv1i8.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv2i8(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv2i8.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv2i8_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv2i8_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv2i8.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv4i8(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv4i8.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv4i8_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv4i8_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv4i8.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv8i8(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { 
, i32 } @llvm.vp.load.ff.nxv8i8.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv8i8_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv8i8_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv8i8.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv16i8(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv16i8.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv16i8_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv16i8_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv16i8.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv32i8(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv32i8.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv32i8_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv32i8_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv32i8.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv64i8(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv64i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv64i8.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv64i8_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv64i8_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv64i8.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define @vploadff_nxv128i8(ptr %ptr, ptr %evl_out, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv128i8: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: slli a3, a3, 3 +; CHECK-NEXT: bltu a2, a3, .LBB14_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a2, a3 +; CHECK-NEXT: .LBB14_2: +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: sw a0, 0(a1) +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv128i8.p0(ptr %ptr, %m, i32 %evl) + %result0 = extractvalue { , i32 } %load, 0 + %result1 = extractvalue { , i32 } %load, 1 + store i32 %result1, ptr %evl_out + ret %result0 +} + +define @vploadff_nxv128i8_allones_mask(ptr %ptr, ptr %evl_out, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv128i8_allones_mask: +; CHECK: # %bb.0: 
+; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: slli a3, a3, 3 +; CHECK-NEXT: bltu a2, a3, .LBB15_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a2, a3 +; CHECK-NEXT: .LBB15_2: +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: sw a0, 0(a1) +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv128i8.p0(ptr %ptr, splat (i1 true), i32 %evl) + %result0 = extractvalue { , i32 } %load, 0 + %result1 = extractvalue { , i32 } %load, 1 + store i32 %result1, ptr %evl_out + ret %result0 +} + +define { , i32 } @vploadff_nxv1i16(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv1i16.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv1i16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv1i16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv1i16.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv2i16(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv2i16.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv2i16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv2i16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv2i16.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv4i16(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv4i16.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv4i16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv4i16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv4i16.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv8i16(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv8i16.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv8i16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv8i16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv8i16.p0(ptr %ptr, splat (i1 true), i32 %evl) + 
ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv16i16(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv16i16.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv16i16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv16i16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv16i16.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv32i16(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv32i16.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv32i16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv32i16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv32i16.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv1i32(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv1i32.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv1i32_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv1i32_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv1i32.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv2i32(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv2i32.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv2i32_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv2i32_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv2i32.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv4i32(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv4i32.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv4i32_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv4i32_allones_mask: +; CHECK: # 
%bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv4i32.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv8i32(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv8i32.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv8i32_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv8i32_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv8i32.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv16i32(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv16i32.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv16i32_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv16i32_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv16i32.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv1i64(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv1i64.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv1i64_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv1i64_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv1i64.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv2i64(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv2i64.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv2i64_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv2i64_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv2i64.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv4i64(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } 
@llvm.vp.load.ff.nxv4i64.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv4i64_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv4i64_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv4i64.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv8i64(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv8i64.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv8i64_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv8i64_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv8i64.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv1f16(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv1f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv1f16.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv1f16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv1f16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv1f16.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv2f16(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv2f16.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv2f16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv2f16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv2f16.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv4f16(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv4f16.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv4f16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv4f16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv4f16.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv8f16(ptr %ptr, %m, i32 zeroext %evl) 
{ +; CHECK-LABEL: vploadff_nxv8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv8f16.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv8f16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv8f16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv8f16.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv16f16(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv16f16.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv16f16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv16f16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv16f16.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv32f16(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv32f16.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv32f16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv32f16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv32f16.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv1f32(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv1f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv1f32.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv1f32_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv1f32_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv1f32.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv2f32(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv2f32.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv2f32_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv2f32_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0) +; 
CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv2f32.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv4f32(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv4f32.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv4f32_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv4f32_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv4f32.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv8f32(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv8f32.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv8f32_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv8f32_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv8f32.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv16f32(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv16f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv16f32.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv16f32_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv16f32_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv16f32.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv1f64(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv1f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv1f64.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv1f64_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv1f64_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv1f64.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv2f64(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv2f64.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , 
i32 } @vploadff_nxv2f64_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv2f64_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv2f64.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv4f64(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv4f64.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv4f64_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv4f64_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv4f64.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv8f64(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv8f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv8f64.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv8f64_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv8f64_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv8f64.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv1bf16(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv1bf16.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv1bf16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv1bf16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv1bf16.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv2bf16(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv2bf16.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv2bf16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv2bf16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv2bf16.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv4bf16(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli 
zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv4bf16.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv4bf16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv4bf16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv4bf16.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv8bf16(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv8bf16.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv8bf16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv8bf16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv8bf16.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv16bf16(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv16bf16.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv16bf16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv16bf16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv16bf16.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv32bf16(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv32bf16.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv32bf16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv32bf16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv32bf16.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv3i8(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv3i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.vp.load.ff.nxv3i8.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/find.ll b/llvm/test/Transforms/LoopVectorize/RISCV/find.ll new file mode 100644 index 0000000000000..3e77436fbede9 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/find.ll @@ -0,0 +1,168 @@ +; NOTE: Assertions have been 
autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=loop-vectorize -force-tail-folding-style=data-with-evl -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -enable-speculative-load -mtriple=riscv64 -mattr=+v -S %s | FileCheck %s + +define ptr @find_with_liveout(ptr %first, ptr %last, ptr %value) { +; CHECK-LABEL: define ptr @find_with_liveout( +; CHECK-SAME: ptr [[FIRST:%.*]], ptr [[LAST:%.*]], ptr [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP_NOT6:%.*]] = icmp eq ptr [[FIRST]], [[LAST]] +; CHECK-NEXT: br i1 [[CMP_NOT6]], label %[[RETURN:.*]], label %[[FOR_BODY_LR_PH:.*]] +; CHECK: [[FOR_BODY_LR_PH]]: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[VALUE]], align 4 +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[FIRST_ADDR_07:%.*]] = phi ptr [ [[FIRST]], %[[FOR_BODY_LR_PH]] ], [ [[INCDEC_PTR:%.*]], %[[FOR_INC:.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[FIRST_ADDR_07]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[TMP1]], [[TMP0]] +; CHECK-NEXT: br i1 [[CMP1]], label %[[RETURN_LOOPEXIT:.*]], label %[[FOR_INC]] +; CHECK: [[FOR_INC]]: +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i32, ptr [[FIRST_ADDR_07]], i64 1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[INCDEC_PTR]], [[LAST]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label %[[RETURN_LOOPEXIT]], label %[[FOR_BODY]] +; CHECK: [[RETURN_LOOPEXIT]]: +; CHECK-NEXT: [[RETVAL_0_PH:%.*]] = phi ptr [ [[FIRST_ADDR_07]], %[[FOR_BODY]] ], [ [[LAST]], %[[FOR_INC]] ] +; CHECK-NEXT: br label %[[RETURN]] +; CHECK: [[RETURN]]: +; CHECK-NEXT: [[RETVAL_0:%.*]] = phi ptr [ [[FIRST]], %[[ENTRY]] ], [ [[RETVAL_0_PH]], %[[RETURN_LOOPEXIT]] ] +; CHECK-NEXT: ret ptr [[RETVAL_0]] +; +entry: + %cmp.not6 = icmp eq ptr %first, %last + br i1 %cmp.not6, label %return, label %for.body.lr.ph + +for.body.lr.ph: + %0 = load i32, ptr %value, align 4 + br label %for.body + +for.body: + %first.addr.07 = phi ptr [ %first, %for.body.lr.ph ], [ %incdec.ptr, %for.inc ] + %1 = load i32, ptr %first.addr.07, align 4 + %cmp1 = icmp eq i32 %1, %0 + br i1 %cmp1, label %return.loopexit, label %for.inc + +for.inc: + %incdec.ptr = getelementptr inbounds i32, ptr %first.addr.07, i64 1 + %cmp.not = icmp eq ptr %incdec.ptr, %last + br i1 %cmp.not, label %return.loopexit, label %for.body + +return.loopexit: + %retval.0.ph = phi ptr [ %first.addr.07, %for.body ], [ %last, %for.inc ] + br label %return + +return: + %retval.0 = phi ptr [ %first, %entry ], [ %retval.0.ph, %return.loopexit ] + ret ptr %retval.0 +} + +define i32 @find_without_liveout(ptr %first, ptr %last, ptr %value) { +; CHECK-LABEL: define i32 @find_without_liveout( +; CHECK-SAME: ptr [[FIRST:%.*]], ptr [[LAST:%.*]], ptr [[VALUE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[FIRST4:%.*]] = ptrtoint ptr [[FIRST]] to i64 +; CHECK-NEXT: [[LAST3:%.*]] = ptrtoint ptr [[LAST]] to i64 +; CHECK-NEXT: [[FIRST2:%.*]] = ptrtoint ptr [[FIRST]] to i64 +; CHECK-NEXT: [[LAST1:%.*]] = ptrtoint ptr [[LAST]] to i64 +; CHECK-NEXT: [[CMP_NOT6:%.*]] = icmp eq ptr [[FIRST]], [[LAST]] +; CHECK-NEXT: br i1 [[CMP_NOT6]], label %[[RETURN:.*]], label %[[FOR_BODY_LR_PH:.*]] +; CHECK: [[FOR_BODY_LR_PH]]: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[VALUE]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[LAST3]], -4 +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], [[FIRST4]] +; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1 +; CHECK-NEXT: br 
i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; CHECK: [[VECTOR_SCEVCHECK]]:
+; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[LAST1]] to i2
+; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[FIRST2]] to i2
+; CHECK-NEXT: [[TMP7:%.*]] = sub i2 [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = zext i2 [[TMP7]] to i64
+; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP8]], 0
+; CHECK-NEXT: br i1 [[IDENT_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
+; CHECK-NEXT: [[TMP11:%.*]] = sub i64 [[TMP10]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP4]], [[TMP11]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP10]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP0]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[AVL:%.*]] = sub i64 [[TMP4]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[EVL_BASED_IV]], 4
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[VP_OP_LOAD_FF:%.*]] = call { <vscale x 4 x i32>, i32 } @llvm.vp.load.ff.nxv4i32.p0(ptr align 4 [[NEXT_GEP]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
+; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, i32 } [[VP_OP_LOAD_FF]], 0
+; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, i32 } [[VP_OP_LOAD_FF]], 1
+; CHECK-NEXT: [[TMP18:%.*]] = icmp eq <vscale x 4 x i32> [[TMP16]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP19:%.*]] = zext i32 [[TMP17]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP18]])
+; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[TMP4]]
+; CHECK-NEXT: [[TMP22:%.*]] = or i1 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: br i1 [[TMP22]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_SPLIT]]:
+; CHECK-NEXT: br i1 [[TMP20]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[RETURN_LOOPEXIT:.*]]
+; CHECK: [[VECTOR_EARLY_EXIT]]:
+; CHECK-NEXT: br label %[[RETURN_LOOPEXIT]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[FIRST]], %[[FOR_BODY_LR_PH]] ], [ [[FIRST]], %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[FIRST_ADDR_07:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[FIRST_ADDR_07]], align 4
+; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[TMP23]], [[TMP0]]
+; CHECK-NEXT: br i1 [[CMP1]], label %[[RETURN_LOOPEXIT]], label %[[FOR_INC]]
+; CHECK: [[FOR_INC]]:
+; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i32, ptr [[FIRST_ADDR_07]], i64 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[INCDEC_PTR]], [[LAST]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label %[[RETURN_LOOPEXIT]], label
%[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[RETURN_LOOPEXIT]]: +; CHECK-NEXT: [[RETVAL_0_PH:%.*]] = phi i32 [ 0, %[[FOR_BODY]] ], [ 1, %[[FOR_INC]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ] +; CHECK-NEXT: br label %[[RETURN]] +; CHECK: [[RETURN]]: +; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RETVAL_0_PH]], %[[RETURN_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[RETVAL_0]] +; +entry: + %cmp.not6 = icmp eq ptr %first, %last + br i1 %cmp.not6, label %return, label %for.body.lr.ph + +for.body.lr.ph: + %0 = load i32, ptr %value, align 4 + br label %for.body + +for.body: + %first.addr.07 = phi ptr [ %first, %for.body.lr.ph ], [ %incdec.ptr, %for.inc ] + %1 = load i32, ptr %first.addr.07, align 4 + %cmp1 = icmp eq i32 %1, %0 + br i1 %cmp1, label %return.loopexit, label %for.inc + +for.inc: + %incdec.ptr = getelementptr inbounds i32, ptr %first.addr.07, i64 1 + %cmp.not = icmp eq ptr %incdec.ptr, %last + br i1 %cmp.not, label %return.loopexit, label %for.body + +return.loopexit: + %retval.0.ph = phi i32 [ 0, %for.body ], [ 1, %for.inc ] + br label %return + +return: + %retval.0 = phi i32 [ 0, %entry ], [ %retval.0.ph, %return.loopexit ] + ret i32 %retval.0 +} +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} +; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll index b82b7f3fb33b4..81ff3ba45e1fa 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll @@ -34,13 +34,10 @@ define void @test(ptr %p, i64 %a, i8 %b) { ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_COND]] ] ; CHECK-NEXT: [[AVL:%.*]] = sub i32 9, [[EVL_BASED_IV]] ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 2, i1 true) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i32 [[TMP11]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = mul i32 1, [[TMP11]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement poison, i32 [[TMP20]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector [[BROADCAST_SPLATINSERT7]], poison, zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = call @llvm.stepvector.nxv2i32() -; CHECK-NEXT: [[TMP13:%.*]] = icmp ult [[TMP19]], [[BROADCAST_SPLAT6]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp ule [[VEC_IND]], splat (i32 8) ; CHECK-NEXT: [[TMP14:%.*]] = icmp sge [[VEC_IND]], splat (i32 2) ; CHECK-NEXT: [[TMP15:%.*]] = select [[TMP13]], [[TMP14]], zeroinitializer ; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP15]], [[TMP7]], [[TMP8]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cond-reduction.ll index 48e080c93f0b5..4d93868bbd22c 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cond-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cond-reduction.ll @@ -250,19 +250,24 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] ; 
IF-EVL-OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-OUTLOOP-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 ; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4 ; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = insertelement zeroinitializer, i32 [[START]], i32 0 +; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL-OUTLOOP: vector.body: ; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 4, i1 true) -; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP11]], i64 0 -; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = call @llvm.stepvector.nxv4i32() -; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = icmp ult [[TMP12]], [[BROADCAST_SPLAT]] +; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = call @llvm.stepvector.nxv4i64() +; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = add zeroinitializer, [[TMP12]] +; IF-EVL-OUTLOOP-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT2]], [[TMP13]] +; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT]] ; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV]] ; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], splat (i1 true), i32 [[TMP11]]) ; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp sle [[VP_OP_LOAD]], splat (i32 3) @@ -757,26 +762,34 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] ; IF-EVL-OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-OUTLOOP-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 ; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4 ; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = insertelement zeroinitializer, i32 [[START]], i32 0 +; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = call @llvm.stepvector.nxv4i64() +; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = mul [[TMP15]], splat (i64 1) +; IF-EVL-OUTLOOP-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP16]] ; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv4i32() ; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = mul [[TMP10]], splat (i32 1) ; IF-EVL-OUTLOOP-NEXT: 
[[INDUCTION1:%.*]] = add zeroinitializer, [[TMP13]] ; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL-OUTLOOP: vector.body: ; IF-EVL-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-OUTLOOP-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-OUTLOOP-NEXT: [[VEC_IND2:%.*]] = phi [ [[INDUCTION1]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT7:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-OUTLOOP-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[IV]] ; IF-EVL-OUTLOOP-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) -; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[TMP14]], i64 0 -; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer ; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = mul i32 1, [[TMP14]] ; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP11]], i64 0 ; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer -; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = call @llvm.stepvector.nxv4i32() -; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp ult [[TMP12]], [[BROADCAST_SPLAT4]] +; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = zext i32 [[TMP14]] to i64 +; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = mul i64 1, [[TMP12]] +; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i64 [[TMP17]], i64 0 +; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer +; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp ule [[VEC_IND]], [[BROADCAST_SPLAT]] ; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] ; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[ARRAYIDX]], splat (i1 true), i32 [[TMP14]]) ; IF-EVL-OUTLOOP-NEXT: [[TMP21:%.*]] = icmp sle [[VP_OP_LOAD]], [[VEC_IND2]] @@ -786,6 +799,7 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: [[TMP24]] = call @llvm.vp.merge.nxv4i32( splat (i1 true), [[PREDPHI]], [[VEC_PHI]], i32 [[TMP14]]) ; IF-EVL-OUTLOOP-NEXT: [[TMP25:%.*]] = zext i32 [[TMP14]] to i64 ; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP25]], [[IV]] +; IF-EVL-OUTLOOP-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT6]] ; IF-EVL-OUTLOOP-NEXT: [[VEC_IND_NEXT7]] = add [[VEC_IND2]], [[BROADCAST_SPLAT2]] ; IF-EVL-OUTLOOP-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]] ; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK1:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll index 2678989731634..236cc92f7a0c0 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll @@ -32,8 +32,8 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) { ; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP13:%.*]] = zext i32 [[TMP11]] to i64 ; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP13]] -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement 
poison, i64 [[TMP9]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], [[VEC_IND]], i32 0 ; IF-EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.vp.gather.nxv4i32.nxv4p0( align 4 [[TMP21]], splat (i1 true), i32 [[TMP11]]) ; IF-EVL-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], [[VEC_IND]], i32 1 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll index 41b96365af59d..3350f40105608 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll @@ -412,14 +412,11 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; TF-SCALABLE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; TF-SCALABLE-NEXT: [[AVL:%.*]] = sub i64 1025, [[INDEX]] ; TF-SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) -; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer ; TF-SCALABLE-NEXT: [[TMP11:%.*]] = zext i32 [[TMP7]] to i64 ; TF-SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP11]] ; TF-SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 ; TF-SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; TF-SCALABLE-NEXT: [[TMP16:%.*]] = call @llvm.stepvector.nxv4i32() -; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = icmp ult [[TMP16]], [[BROADCAST_SPLAT4]] +; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = icmp ule [[VEC_IND]], splat (i64 1024) ; TF-SCALABLE-NEXT: [[TMP10:%.*]] = icmp ugt [[VEC_IND]], splat (i64 10) ; TF-SCALABLE-NEXT: [[TMP9:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP10]], zeroinitializer ; TF-SCALABLE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv4i64.nxv4p0( align 8 [[BROADCAST_SPLAT]], [[TMP10]], i32 [[TMP7]]) @@ -867,8 +864,8 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias ; TF-SCALABLE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; TF-SCALABLE-NEXT: [[AVL:%.*]] = sub i64 1025, [[INDEX]] ; TF-SCALABLE-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) -; TF-SCALABLE-NEXT: [[TMP13:%.*]] = zext i32 [[TMP9]] to i64 -; TF-SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP13]] +; TF-SCALABLE-NEXT: [[TMP12:%.*]] = zext i32 [[TMP9]] to i64 +; TF-SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP12]] ; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 ; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; TF-SCALABLE-NEXT: call void @llvm.vp.scatter.nxv2i64.nxv2p0( [[VEC_IND]], align 8 [[BROADCAST_SPLAT]], splat (i1 true), i32 [[TMP9]]) @@ -877,8 +874,8 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias ; TF-SCALABLE-NEXT: [[TMP11:%.*]] = zext 
i32 [[TMP9]] to i64 ; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]] ; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] -; TF-SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1025 -; TF-SCALABLE-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; TF-SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1025 +; TF-SCALABLE-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; TF-SCALABLE: [[MIDDLE_BLOCK]]: ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]] ; TF-SCALABLE: [[SCALAR_PH]]: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-early-exit.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-early-exit.ll new file mode 100644 index 0000000000000..06d0a4f146b11 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-early-exit.ll @@ -0,0 +1,94 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=loop-vectorize -force-tail-folding-style=data-with-evl -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=riscv64 -mattr=+v -S -enable-early-exit-vectorization %s | FileCheck %s + +declare void @init(ptr) + +define i64 @multi_exiting_to_different_exits_live_in_exit_values() { +; CHECK-LABEL: define i64 @multi_exiting_to_different_exits_live_in_exit_values( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[SRC:%.*]] = alloca [128 x i32], align 4 +; CHECK-NEXT: call void @init(ptr [[SRC]]) +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 128, [[TMP2]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4 +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = sub i64 128, [[EVL_BASED_IV]] +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[EVL_BASED_IV]] +; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[GEP_SRC]], splat (i1 true), i32 [[TMP5]]) +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq [[VP_OP_LOAD]], splat (i32 10) +; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP5]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP15]], [[EVL_BASED_IV]] +; CHECK-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP12]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 128 +; CHECK-NEXT: [[TMP19:%.*]] = or i1 [[TMP17]], [[TMP18]] +; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_SPLIT]]: +; CHECK-NEXT: br i1 [[TMP17]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[E2:.*]] +; CHECK: 
[[VECTOR_EARLY_EXIT]]: +; CHECK-NEXT: br label %[[E1:.*]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[GEP_SRC1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IV1]] +; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC1]], align 4 +; CHECK-NEXT: [[C_1:%.*]] = icmp eq i32 [[L]], 10 +; CHECK-NEXT: br i1 [[C_1]], label %[[E1]], label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[INC]] = add nuw i64 [[IV1]], 1 +; CHECK-NEXT: [[C_2:%.*]] = icmp eq i64 [[INC]], 128 +; CHECK-NEXT: br i1 [[C_2]], label %[[E2]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[E1]]: +; CHECK-NEXT: [[P1:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ] +; CHECK-NEXT: ret i64 [[P1]] +; CHECK: [[E2]]: +; CHECK-NEXT: [[P2:%.*]] = phi i64 [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i64 [[P2]] +; +entry: + %src = alloca [128 x i32] + call void @init(ptr %src) + br label %loop.header + +loop.header: + %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ] + %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv + %l = load i32, ptr %gep.src + %c.1 = icmp eq i32 %l, 10 + br i1 %c.1, label %e1, label %loop.latch + +loop.latch: + %inc = add nuw i64 %iv, 1 + %c.2 = icmp eq i64 %inc, 128 + br i1 %c.2, label %e2, label %loop.header + +e1: + %p1 = phi i64 [ 0, %loop.header ] + ret i64 %p1 + +e2: + %p2 = phi i64 [ 1, %loop.latch ] + ret i64 %p2 +} +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} +; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META3]], [[META1]]} +;. diff --git a/llvm/unittests/IR/VPIntrinsicTest.cpp b/llvm/unittests/IR/VPIntrinsicTest.cpp index d6ad7599ce461..a101979ee6a4a 100644 --- a/llvm/unittests/IR/VPIntrinsicTest.cpp +++ b/llvm/unittests/IR/VPIntrinsicTest.cpp @@ -100,6 +100,8 @@ class VPIntrinsicTest : public testing::Test { "i32*>, <8 x i1>, i32) "; Str << " declare <8 x i32> @llvm.vp.load.v8i32.p0v8i32(<8 x i32>*, <8 x " "i1>, i32) "; + Str << " declare {<8 x i32>, i32} @llvm.vp.load.ff.v8i32.p0v8i32(<8 x " + "i32>*, <8 x i1>, i32) "; Str << "declare <8 x i32> " "@llvm.experimental.vp.strided.load.v8i32.i32(i32*, i32, <8 " "x i1>, i32) ";