From eb2d7aa90afdb5fc6f54a4fd9b4828c4050cc7ce Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 18 Jul 2025 19:34:54 +0000 Subject: [PATCH 1/3] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20in?= =?UTF-8?q?itial=20version?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Created using spr 1.3.5 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 146 +++++++++++++++++- .../RISCV/reordered-buildvector-scalars.ll | 102 ++++++------ 2 files changed, 197 insertions(+), 51 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 6ad5c60105a28..14d4e45d1ab6f 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1906,6 +1906,7 @@ class BoUpSLP { void deleteTree() { VectorizableTree.clear(); ScalarToTreeEntries.clear(); + PostponedNodesWithNonVecUsers.clear(); OperandsToTreeEntry.clear(); ScalarsInSplitNodes.clear(); MustGather.clear(); @@ -3896,6 +3897,9 @@ class BoUpSLP { bool hasState() const { return S.valid(); } + /// Returns the state of the operations. + const InstructionsState &getOperations() const { return S; } + /// When ReuseReorderShuffleIndices is empty it just returns position of \p /// V within vector of Scalars. Otherwise, try to remap on its reuse index. unsigned findLaneForValue(Value *V) const { @@ -4290,6 +4294,13 @@ class BoUpSLP { OrdersType &CurrentOrder, SmallVectorImpl &PointerOps); + /// Checks if it is profitable to vectorize the specified list of the + /// instructions if not all users are vectorized. + bool isProfitableToVectorizeWithNonVecUsers(const InstructionsState &S, + const EdgeInfo &UserTreeIdx, + ArrayRef Scalars, + ArrayRef ScalarsMask); + /// Maps a specific scalar to its tree entry(ies). SmallDenseMap> ScalarToTreeEntries; @@ -4300,6 +4311,9 @@ class BoUpSLP { /// Scalars, used in split vectorize nodes. SmallDenseMap> ScalarsInSplitNodes; + /// List of tree nodes indices, which have non-vectorized users. + SmallSet PostponedNodesWithNonVecUsers; + /// Maps a value to the proposed vectorizable size. SmallDenseMap InstrElementSize; @@ -8993,6 +9007,81 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, return {IntrinsicCost, LibCost}; } +bool BoUpSLP::isProfitableToVectorizeWithNonVecUsers( + const InstructionsState &S, const EdgeInfo &UserTreeIdx, + ArrayRef Scalars, ArrayRef ScalarsMask) { + assert(S && "Expected valid instructions state."); + // Loads, extracts and geps are immediately scalarizable, so no need to check. + if (S.getOpcode() == Instruction::Load || + S.getOpcode() == Instruction::ExtractElement || + S.getOpcode() == Instruction::GetElementPtr) + return true; + // Check only vectorized users, others scalarized (potentially, at least) + // already. + if (!UserTreeIdx.UserTE || UserTreeIdx.UserTE->isGather() || + UserTreeIdx.UserTE->State == TreeEntry::SplitVectorize) + return true; + // PHI nodes may have cyclic deps, so cannot check here. + if (UserTreeIdx.UserTE->getOpcode() == Instruction::PHI) + return true; + // Do not check root reduction nodes, they do not have non-vectorized users. 
+ if (UserIgnoreList && UserTreeIdx.UserTE->Idx == 0) + return true; + constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + ArrayRef VL = UserTreeIdx.UserTE->Scalars; + Type *UserScalarTy = getValueType(VL.front()); + if (!isValidElementType(UserScalarTy)) + return true; + Type *ScalarTy = getValueType(Scalars.front()); + if (!isValidElementType(ScalarTy)) + return true; + // Ignore subvectors extracts. + if (UserScalarTy->isVectorTy()) + return true; + auto *UserVecTy = + getWidenedType(UserScalarTy, UserTreeIdx.UserTE->getVectorFactor()); + APInt DemandedElts = APInt::getZero(UserTreeIdx.UserTE->getVectorFactor()); + // Check the external uses and check, if vector node + extracts is not + // profitable for the vectorization. + InstructionCost UserScalarsCost = 0; + for (Value *V : VL) { + auto *I = dyn_cast(V); + if (!I) + continue; + if (areAllUsersVectorized(I, UserIgnoreList)) + continue; + DemandedElts.setBit(UserTreeIdx.UserTE->findLaneForValue(V)); + UserScalarsCost += TTI->getInstructionCost(I, CostKind); + } + // No non-vectorized users - success. + if (DemandedElts.isZero()) + return true; + // If extracts are cheaper than the original scalars - success. + InstructionCost ExtractCost = + ::getScalarizationOverhead(*TTI, UserScalarTy, UserVecTy, DemandedElts, + /*Insert=*/false, /*Extract=*/true, CostKind); + if (ExtractCost <= UserScalarsCost) + return true; + SmallPtrSet CheckedExtracts; + InstructionCost NodeCost = + UserTreeIdx.UserTE->State == TreeEntry::CombinedVectorize + ? InstructionCost(0) + : getEntryCost(UserTreeIdx.UserTE, {}, CheckedExtracts); + // The node is profitable for vectorization - success. + if (ExtractCost + NodeCost <= -SLPCostThreshold) + return true; + auto *VecTy = getWidenedType(ScalarTy, Scalars.size()); + InstructionCost ScalarsCost = ::getScalarizationOverhead( + *TTI, ScalarTy, VecTy, APInt::getAllOnes(Scalars.size()), + /*Insert=*/true, /*Extract=*/false, CostKind); + if (!ScalarsMask.empty()) + ScalarsCost += getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, VecTy, + ScalarsMask, CostKind); + + // User extracts are cheaper than user scalars + immediate scalars - success. + return ExtractCost - (UserScalarsCost + ScalarsCost) < -SLPCostThreshold; +} + BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( const InstructionsState &S, ArrayRef VL, bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder, @@ -10283,6 +10372,17 @@ void BoUpSLP::buildTreeRec(ArrayRef VLRef, unsigned Depth, return; } + // Postpone vectorization, if the node is not profitable because of the + // external uses. + if (!isProfitableToVectorizeWithNonVecUsers(S, UserTreeIdx, VL, + ReuseShuffleIndices)) { + PostponedNodesWithNonVecUsers.insert(VectorizableTree.size()); + auto Invalid = ScheduleBundle::invalid(); + newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndices); + return; + } + Instruction *VL0 = S.getMainOp(); BasicBlock *BB = VL0->getParent(); auto &BSRef = BlocksSchedules[BB]; @@ -11668,6 +11768,27 @@ void BoUpSLP::transformNodes() { ArrayRef VL = E.Scalars; const unsigned Sz = getVectorElementSize(VL.front()); unsigned MinVF = getMinVF(2 * Sz); + const EdgeInfo &EI = E.UserTreeIndex; + // Try to vectorized postponed scalars, if external uses are vectorized. 
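// (Editorial note, not part of the patch: the retry below re-runs buildTreeRec
// on the postponed scalars with this entry as the user. If that re-run adds
// only a single gather node, it is popped again and nothing changes; otherwise
// the freshly built entry is recorded in CombinedEntriesWithIndices at offset 0
// and the loop continues with the next entry.)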
+ if (PostponedNodesWithNonVecUsers.contains(E.Idx) && + isProfitableToVectorizeWithNonVecUsers( + E.getOperations(), EI, E.Scalars, E.ReuseShuffleIndices)) { + assert(E.hasState() && "Expected to have state"); + unsigned PrevSize = VectorizableTree.size(); + [[maybe_unused]] unsigned PrevEntriesSize = + LoadEntriesToVectorize.size(); + buildTreeRec(VL, 0, EdgeInfo(&E, UINT_MAX)); + if (PrevSize + 1 == VectorizableTree.size() && + VectorizableTree[PrevSize]->isGather()) { + VectorizableTree.pop_back(); + assert(PrevEntriesSize == LoadEntriesToVectorize.size() && + "LoadEntriesToVectorize expected to remain the same"); + } else { + E.CombinedEntriesWithIndices.emplace_back(PrevSize, 0); + continue; + } + } + // Do not try partial vectorization for small nodes (<= 2), nodes with the // same opcode and same parent block or all constants. if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) || @@ -12828,7 +12949,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { }); InVectors.front() = V; } - if (!SubVectors.empty()) { + if (!SubVectors.empty() && + (SubVectors.size() > 1 || SubVectors.front().second != 0 || + SubVectors.front().first->getVectorFactor() != CommonMask.size())) { const PointerUnion &Vec = InVectors.front(); if (InVectors.size() == 2) Cost += createShuffle(Vec, InVectors.back(), CommonMask); @@ -13348,7 +13471,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { - auto SrcIt = MinBWs.find(getOperandEntry(E, 0)); + auto SrcIt = + MinBWs.empty() ? MinBWs.end() : MinBWs.find(getOperandEntry(E, 0)); Type *SrcScalarTy = VL0->getOperand(0)->getType(); auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size()); unsigned Opcode = ShuffleOrOp; @@ -13795,7 +13919,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType(); auto *SrcTy = getWidenedType(SrcSclTy, VL.size()); if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) { - auto SrcIt = MinBWs.find(getOperandEntry(E, 0)); + auto SrcIt = MinBWs.empty() ? 
MinBWs.end() + : MinBWs.find(getOperandEntry(E, 0)); unsigned BWSz = DL->getTypeSizeInBits(ScalarTy); unsigned SrcBWSz = DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType()); @@ -14793,6 +14918,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals, auto *Inst = cast(EU.Scalar); InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind); auto OperandIsScalar = [&](Value *V) { + if (!isa(V)) + return true; if (!isVectorized(V)) { // Some extractelements might be not vectorized, but // transformed into shuffle and removed from the function, @@ -16873,7 +17000,18 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { }); InVectors.front() = Vec; } - if (!SubVectors.empty()) { + if (SubVectors.size() == 1 && SubVectors.front().second == 0 && + SubVectors.front().first->getVectorFactor() == CommonMask.size()) { + Value *Vec = SubVectors.front().first->VectorizedValue; + if (Vec->getType()->isIntOrIntVectorTy()) + Vec = castToScalarTyElem( + Vec, any_of(SubVectors.front().first->Scalars, [&](Value *V) { + if (isa(V)) + return false; + return !isKnownNonNegative(V, SimplifyQuery(*R.DL)); + })); + transformMaskAfterShuffle(CommonMask, CommonMask); + } else if (!SubVectors.empty()) { Value *Vec = InVectors.front(); if (InVectors.size() == 2) { Vec = createShuffle(Vec, InVectors.back(), CommonMask); diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll index d4e323819402c..117c892b79f9a 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll @@ -102,80 +102,88 @@ define fastcc i32 @test(i32 %0, i32 %add111.i.i, <4 x i32> %PredPel.i.sroa.86.72 ; THRESH-NEXT: [[SHR143_5_I_I9:%.*]] = ashr i32 [[TMP0]], 1 ; THRESH-NEXT: [[ADD1392_I:%.*]] = add i32 [[TMP0]], 1 ; THRESH-NEXT: [[MUL1445_I:%.*]] = shl i32 [[TMP0]], 1 -; THRESH-NEXT: [[ADD2136_I:%.*]] = or i32 [[LOOPARRAY_SROA_24_0_I_I3]], [[TMP0]] -; THRESH-NEXT: [[SHR2137_I:%.*]] = lshr i32 [[ADD2136_I]], 1 -; THRESH-NEXT: [[CONV2138_I:%.*]] = trunc i32 [[SHR2137_I]] to i16 ; THRESH-NEXT: [[ADD2174_I:%.*]] = add i32 [[MUL1445_I]], 2 ; THRESH-NEXT: [[SHR2175_I:%.*]] = lshr i32 [[ADD2174_I]], 2 -; THRESH-NEXT: [[CONV2176_I:%.*]] = trunc i32 [[SHR2175_I]] to i16 ; THRESH-NEXT: [[ADD2190_I:%.*]] = or i32 [[ADD1392_I]], 1 ; THRESH-NEXT: [[ADD2191_I:%.*]] = add i32 [[ADD2190_I]], [[TMP0]] -; THRESH-NEXT: [[CONV2193_I:%.*]] = trunc i32 [[ADD2191_I]] to i16 -; THRESH-NEXT: [[ADD2203_I:%.*]] = or i32 [[TMP0]], 1 -; THRESH-NEXT: [[ADD2204_I:%.*]] = add i32 [[ADD2203_I]], [[TMP0]] -; THRESH-NEXT: [[CONV2206_I:%.*]] = trunc i32 [[ADD2204_I]] to i16 +; THRESH-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> +; THRESH-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 0 +; THRESH-NEXT: [[TMP4:%.*]] = add <2 x i32> [[TMP3]], splat (i32 1) +; THRESH-NEXT: [[TMP5:%.*]] = lshr <2 x i32> [[TMP4]], splat (i32 1) ; THRESH-NEXT: [[ADD2235_I16:%.*]] = or i32 [[TMP0]], 1 ; THRESH-NEXT: [[ADD2236_I:%.*]] = add i32 [[ADD2235_I16]], 1 ; THRESH-NEXT: [[SHR2237_I:%.*]] = lshr i32 [[ADD2236_I]], 1 -; THRESH-NEXT: [[CONV2238_I:%.*]] = trunc i32 [[SHR2237_I]] to i16 -; THRESH-NEXT: store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8196), align 4 -; THRESH-NEXT: store i16 [[CONV2238_I]], ptr getelementptr 
inbounds nuw (i8, ptr @images, i64 8176), align 8 -; THRESH-NEXT: [[ADD2258_I:%.*]] = or i32 [[ADD111_I_I]], [[TMP0]] -; THRESH-NEXT: [[SHR2259_I:%.*]] = lshr i32 [[ADD2258_I]], 1 -; THRESH-NEXT: [[CONV2260_I:%.*]] = trunc i32 [[SHR2259_I]] to i16 -; THRESH-NEXT: store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8212), align 4 -; THRESH-NEXT: store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8192), align 8 -; THRESH-NEXT: store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8172), align 4 +; THRESH-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[ADD111_I_I]], i32 0 +; THRESH-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 1 +; THRESH-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0 +; THRESH-NEXT: [[TMP21:%.*]] = shufflevector <2 x i32> [[TMP20]], <2 x i32> poison, <2 x i32> zeroinitializer +; THRESH-NEXT: [[TMP22:%.*]] = or <2 x i32> [[TMP19]], [[TMP21]] ; THRESH-NEXT: [[ADD2302_I:%.*]] = add i32 [[TMP0]], 1 ; THRESH-NEXT: [[SHR2303_I:%.*]] = lshr i32 [[ADD2302_I]], 1 -; THRESH-NEXT: [[CONV2304_I:%.*]] = trunc i32 [[SHR2303_I]] to i16 -; THRESH-NEXT: store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8224), align 8 -; THRESH-NEXT: store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8204), align 4 -; THRESH-NEXT: store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8184), align 8 ; THRESH-NEXT: [[ADD2323_I:%.*]] = add i32 [[TMP0]], 1 ; THRESH-NEXT: [[ADD2324_I:%.*]] = or i32 [[ADD2323_I]], [[TMP0]] ; THRESH-NEXT: [[SHR2325_I:%.*]] = lshr i32 [[ADD2324_I]], 1 -; THRESH-NEXT: [[CONV2326_I:%.*]] = trunc i32 [[SHR2325_I]] to i16 -; THRESH-NEXT: store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8220), align 4 -; THRESH-NEXT: store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8200), align 8 -; THRESH-NEXT: [[ADD2342_I:%.*]] = add i32 [[SHR143_5_I_I9]], 1 -; THRESH-NEXT: [[SHR2343_I:%.*]] = lshr i32 [[ADD2342_I]], 1 -; THRESH-NEXT: [[CONV2344_I:%.*]] = trunc i32 [[SHR2343_I]] to i16 -; THRESH-NEXT: store i16 [[CONV2344_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8216), align 8 +; THRESH-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 1 +; THRESH-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP25]], i32 [[SHR143_5_I_I9]], i32 0 +; THRESH-NEXT: [[TMP13:%.*]] = add <2 x i32> [[TMP12]], splat (i32 1) +; THRESH-NEXT: [[TMP14:%.*]] = or <2 x i32> [[TMP12]], splat (i32 1) +; THRESH-NEXT: [[TMP15:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> ; THRESH-NEXT: [[ADD2355_I:%.*]] = or i32 [[SHR143_5_I_I9]], 1 ; THRESH-NEXT: [[ADD2356_I:%.*]] = add i32 [[ADD2355_I]], [[TMP0]] ; THRESH-NEXT: [[CONV2358_I:%.*]] = trunc i32 [[ADD2356_I]] to i16 ; THRESH-NEXT: store i16 [[CONV2358_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8232), align 8 -; THRESH-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> -; THRESH-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 0 -; THRESH-NEXT: [[TMP4:%.*]] = add <2 x i32> [[TMP3]], splat (i32 1) -; THRESH-NEXT: [[TMP5:%.*]] = lshr <2 x i32> [[TMP4]], splat (i32 1) -; THRESH-NEXT: [[TMP6:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16> -; THRESH-NEXT: store <2 x i16> [[TMP6]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 
8180), align 4 ; THRESH-NEXT: [[ADD2393_I:%.*]] = or i32 [[LOOPARRAY_SROA_24_0_I_I3]], 1 ; THRESH-NEXT: [[ADD2394_I:%.*]] = add i32 [[ADD2393_I]], [[TMP0]] -; THRESH-NEXT: [[CONV2396_I:%.*]] = trunc i32 [[ADD2394_I]] to i16 -; THRESH-NEXT: store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8198), align 2 -; THRESH-NEXT: store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8178), align 2 -; THRESH-NEXT: store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8214), align 2 -; THRESH-NEXT: store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8194), align 2 -; THRESH-NEXT: store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8174), align 2 +; THRESH-NEXT: [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> +; THRESH-NEXT: [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[SHR2303_I]], i32 2 +; THRESH-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> [[TMP17]], i32 [[SHR2175_I]], i32 3 +; THRESH-NEXT: [[CONV2176_I:%.*]] = trunc i32 [[SHR2175_I]] to i16 +; THRESH-NEXT: [[CONV2238_I:%.*]] = trunc i32 [[SHR2237_I]] to i16 +; THRESH-NEXT: store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8196), align 4 ; THRESH-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[PREDPEL_I_SROA_86_72_VEC_EXTRACT]], <4 x i32> poison, <2 x i32> ; THRESH-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[ADD111_I_I]], i32 0 ; THRESH-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP8]], splat (i32 1) ; THRESH-NEXT: [[TMP10:%.*]] = lshr <2 x i32> [[TMP9]], splat (i32 1) +; THRESH-NEXT: [[TMP23:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <8 x i32> +; THRESH-NEXT: [[TMP24:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> [[TMP23]], <8 x i32> ; THRESH-NEXT: [[TMP11:%.*]] = trunc <2 x i32> [[TMP10]] to <2 x i16> -; THRESH-NEXT: [[TMP12:%.*]] = extractelement <2 x i16> [[TMP11]], i32 1 -; THRESH-NEXT: store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8228), align 4 +; THRESH-NEXT: [[TMP26:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> +; THRESH-NEXT: [[TMP27:%.*]] = trunc <2 x i32> [[TMP10]] to <2 x i16> +; THRESH-NEXT: store <2 x i16> [[TMP27]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8228), align 4 ; THRESH-NEXT: store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8208), align 8 -; THRESH-NEXT: store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8188), align 4 -; THRESH-NEXT: store i16 [[TMP12]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8170), align 2 ; THRESH-NEXT: store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8226), align 2 ; THRESH-NEXT: store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8206), align 2 -; THRESH-NEXT: store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8186), align 2 +; THRESH-NEXT: [[CONV2304_I:%.*]] = trunc i32 [[SHR2303_I]] to i16 +; THRESH-NEXT: [[TMP28:%.*]] = insertelement <4 x i32> poison, i32 [[ADD2394_I]], i32 0 +; THRESH-NEXT: [[TMP29:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[SHR2325_I]], i32 1 +; THRESH-NEXT: [[TMP30:%.*]] = insertelement <4 x i32> [[TMP29]], i32 [[ADD2191_I]], i32 2 +; THRESH-NEXT: [[TMP31:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[SHR2303_I]], i32 3 +; THRESH-NEXT: [[TMP32:%.*]] = trunc <4 x i32> [[TMP31]] to <4 x i16> +; THRESH-NEXT: 
[[CONV2193_I:%.*]] = trunc i32 [[ADD2191_I]] to i16 +; THRESH-NEXT: [[CONV2326_I:%.*]] = trunc i32 [[SHR2325_I]] to i16 +; THRESH-NEXT: [[CONV2396_I:%.*]] = trunc i32 [[ADD2394_I]] to i16 +; THRESH-NEXT: store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8220), align 4 +; THRESH-NEXT: store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8178), align 2 +; THRESH-NEXT: store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8224), align 8 ; THRESH-NEXT: store i16 [[CONV2193_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8222), align 2 -; THRESH-NEXT: store i16 [[CONV2193_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8202), align 2 -; THRESH-NEXT: store i16 [[CONV2206_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8218), align 2 +; THRESH-NEXT: store <4 x i16> [[TMP32]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8198), align 2 +; THRESH-NEXT: [[TMP33:%.*]] = shufflevector <2 x i32> [[TMP22]], <2 x i32> poison, <4 x i32> +; THRESH-NEXT: [[TMP34:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <4 x i32> +; THRESH-NEXT: [[TMP35:%.*]] = shufflevector <4 x i32> [[TMP33]], <4 x i32> [[TMP34]], <4 x i32> +; THRESH-NEXT: [[TMP36:%.*]] = insertelement <4 x i32> , i32 [[TMP0]], i32 3 +; THRESH-NEXT: [[TMP37:%.*]] = lshr <4 x i32> [[TMP35]], [[TMP36]] +; THRESH-NEXT: [[TMP38:%.*]] = add <4 x i32> [[TMP35]], [[TMP36]] +; THRESH-NEXT: [[TMP39:%.*]] = shufflevector <4 x i32> [[TMP37]], <4 x i32> [[TMP38]], <4 x i32> +; THRESH-NEXT: [[TMP40:%.*]] = shufflevector <4 x i32> [[TMP39]], <4 x i32> poison, <8 x i32> +; THRESH-NEXT: [[TMP41:%.*]] = shufflevector <8 x i32> [[TMP24]], <8 x i32> [[TMP40]], <8 x i32> +; THRESH-NEXT: [[TMP42:%.*]] = trunc <8 x i32> [[TMP41]] to <8 x i16> +; THRESH-NEXT: [[TMP43:%.*]] = shufflevector <4 x i32> [[TMP26]], <4 x i32> [[TMP39]], <4 x i32> +; THRESH-NEXT: [[TMP44:%.*]] = insertelement <4 x i32> [[TMP43]], i32 [[SHR2237_I]], i32 3 +; THRESH-NEXT: [[TMP45:%.*]] = trunc <4 x i32> [[TMP44]] to <4 x i16> +; THRESH-NEXT: [[TMP46:%.*]] = trunc <4 x i32> [[TMP39]] to <4 x i16> +; THRESH-NEXT: store <4 x i16> [[TMP45]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8170), align 2 +; THRESH-NEXT: store <8 x i16> [[TMP42]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8180), align 4 +; THRESH-NEXT: store <4 x i16> [[TMP46]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8212), align 4 ; THRESH-NEXT: ret i32 0 ; entry: From 708364152fafdce8d6a19f56b14ef2504208c841 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Mon, 28 Jul 2025 14:06:49 +0000 Subject: [PATCH 2/3] Address comments Created using spr 1.3.5 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 73 +++++++++++-------- 1 file changed, 43 insertions(+), 30 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 93bdbee9454ab..c777379f9d244 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -9165,6 +9165,33 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, return {IntrinsicCost, LibCost}; } +/// Check if extracts are cheaper than the original scalars. 
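/// (Editorial sketch, not part of the original patch: with the names used in
/// the body below, the helper answers "keep the user node vectorized and
/// extract the demanded lanes" when any of the following holds:
///   ExtractCost <= UserScalarsCost                 // extracts beat the users' scalar code
///   ExtractCost <= cost of the vectorized user entry
///   ExtractCost <  UserScalarsCost + ScalarsCost
/// where ExtractCost is the scalarization overhead of extracting the demanded
/// lanes from UserVecTy, and ScalarsCost is the insert (plus optional Mask
/// shuffle) overhead of rebuilding this node's scalars as a vector.)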
+static bool areExtractsCheaperThanScalars( + TargetTransformInfo &TTI, Type *UserScalarTy, VectorType *UserVecTy, + const APInt &DemandedElts, const InstructionCost UserScalarsCost, + Type *ScalarTy, unsigned VF, ArrayRef Mask, + const llvm::function_ref GetUserEntryCost) { + constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + // If extracts are cheaper than the original scalars - success. + InstructionCost ExtractCost = + ::getScalarizationOverhead(TTI, UserScalarTy, UserVecTy, DemandedElts, + /*Insert=*/false, /*Extract=*/true, CostKind); + if (ExtractCost <= UserScalarsCost) + return true; + InstructionCost NodeCost = GetUserEntryCost(); + // The node is profitable for vectorization - success. + if (ExtractCost <= NodeCost) + return true; + auto *VecTy = getWidenedType(ScalarTy, VF); + InstructionCost ScalarsCost = + ::getScalarizationOverhead(TTI, ScalarTy, VecTy, APInt::getAllOnes(VF), + /*Insert=*/true, /*Extract=*/false, CostKind); + if (!Mask.empty()) + ScalarsCost += + getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, VecTy, Mask, CostKind); + return ExtractCost < UserScalarsCost + ScalarsCost; +} + bool BoUpSLP::isProfitableToVectorizeWithNonVecUsers( const InstructionsState &S, const EdgeInfo &UserTreeIdx, ArrayRef VL, ArrayRef Mask) { @@ -9193,7 +9220,8 @@ bool BoUpSLP::isProfitableToVectorizeWithNonVecUsers( Type *ScalarTy = getValueType(VL.front()); if (!isValidElementType(ScalarTy)) return true; - // Ignore subvectors extracts for revectorized nodes. + // Ignore subvectors extracts for revectorized nodes, subvector extracts are + // always cheap as they do not require vector-to-scalar move. if (UserScalarTy->isVectorTy()) return true; auto *UserVecTy = @@ -9215,31 +9243,13 @@ bool BoUpSLP::isProfitableToVectorizeWithNonVecUsers( if (DemandedElts.isZero()) return true; - auto AreExtractsCheaperThanScalars = [&]() { - // If extracts are cheaper than the original scalars - success. - InstructionCost ExtractCost = ::getScalarizationOverhead( - *TTI, UserScalarTy, UserVecTy, DemandedElts, - /*Insert=*/false, /*Extract=*/true, CostKind); - if (ExtractCost <= UserScalarsCost) - return true; - SmallPtrSet CheckedExtracts; - InstructionCost NodeCost = - getEntryCost(UserTreeIdx.UserTE, {}, CheckedExtracts); - // The node is profitable for vectorization - success. - if (ExtractCost <= NodeCost) - return true; - auto *VecTy = getWidenedType(ScalarTy, VL.size()); - InstructionCost ScalarsCost = ::getScalarizationOverhead( - *TTI, ScalarTy, VecTy, APInt::getAllOnes(VL.size()), - /*Insert=*/true, /*Extract=*/false, CostKind); - if (!Mask.empty()) - ScalarsCost += - getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, VecTy, Mask, CostKind); - return ExtractCost < UserScalarsCost + ScalarsCost; - }; - // User extracts are cheaper than user scalars + immediate scalars - success. 
- return AreExtractsCheaperThanScalars(); + return areExtractsCheaperThanScalars( + *TTI, UserScalarTy, UserVecTy, DemandedElts, UserScalarsCost, ScalarTy, + VL.size(), Mask, [&]() { + SmallPtrSet CheckedExtracts; + return getEntryCost(UserTreeIdx.UserTE, {}, CheckedExtracts); + }); } BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( @@ -13366,9 +13376,10 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { }); InVectors.front() = V; } - if (!SubVectors.empty() && - (SubVectors.size() > 1 || SubVectors.front().second != 0 || - SubVectors.front().first->getVectorFactor() != CommonMask.size())) { + bool FullSubvectorMatch = + SubVectors.size() == 1 && SubVectors.front().second == 0 && + SubVectors.front().first->getVectorFactor() == CommonMask.size(); + if (!SubVectors.empty() && !FullSubvectorMatch) { const PointerUnion &Vec = InVectors.front(); if (InVectors.size() == 2) Cost += createShuffle(Vec, InVectors.back(), CommonMask); @@ -17466,8 +17477,10 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { }); InVectors.front() = Vec; } - if (SubVectors.size() == 1 && SubVectors.front().second == 0 && - SubVectors.front().first->getVectorFactor() == CommonMask.size()) { + const bool FullSubvectorMatch = + SubVectors.size() == 1 && SubVectors.front().second == 0 && + SubVectors.front().first->getVectorFactor() == CommonMask.size(); + if (FullSubvectorMatch) { Value *Vec = SubVectors.front().first->VectorizedValue; if (Vec->getType()->isIntOrIntVectorTy()) Vec = castToScalarTyElem( From ee156a1715a5fd9195ca5569f36e418429c2e88d Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Tue, 29 Jul 2025 18:34:06 +0000 Subject: [PATCH 3/3] Precalculate the node cost Created using spr 1.3.5 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index c777379f9d244..9ec4b389300fd 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -9166,11 +9166,12 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, } /// Check if extracts are cheaper than the original scalars. -static bool areExtractsCheaperThanScalars( - TargetTransformInfo &TTI, Type *UserScalarTy, VectorType *UserVecTy, - const APInt &DemandedElts, const InstructionCost UserScalarsCost, - Type *ScalarTy, unsigned VF, ArrayRef Mask, - const llvm::function_ref GetUserEntryCost) { +static bool +areExtractsCheaperThanScalars(TargetTransformInfo &TTI, Type *UserScalarTy, + VectorType *UserVecTy, const APInt &DemandedElts, + const InstructionCost UserScalarsCost, + Type *ScalarTy, unsigned VF, ArrayRef Mask, + InstructionCost UserEntryCost) { constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; // If extracts are cheaper than the original scalars - success. InstructionCost ExtractCost = @@ -9178,9 +9179,8 @@ static bool areExtractsCheaperThanScalars( /*Insert=*/false, /*Extract=*/true, CostKind); if (ExtractCost <= UserScalarsCost) return true; - InstructionCost NodeCost = GetUserEntryCost(); // The node is profitable for vectorization - success. 
- if (ExtractCost <= NodeCost) + if (ExtractCost <= UserEntryCost) return true; auto *VecTy = getWidenedType(ScalarTy, VF); InstructionCost ScalarsCost = @@ -9244,12 +9244,12 @@ bool BoUpSLP::isProfitableToVectorizeWithNonVecUsers( return true; // User extracts are cheaper than user scalars + immediate scalars - success. - return areExtractsCheaperThanScalars( - *TTI, UserScalarTy, UserVecTy, DemandedElts, UserScalarsCost, ScalarTy, - VL.size(), Mask, [&]() { - SmallPtrSet CheckedExtracts; - return getEntryCost(UserTreeIdx.UserTE, {}, CheckedExtracts); - }); + SmallPtrSet CheckedExtracts; + InstructionCost UserEntryCost = + getEntryCost(UserTreeIdx.UserTE, {}, CheckedExtracts); + return areExtractsCheaperThanScalars(*TTI, UserScalarTy, UserVecTy, + DemandedElts, UserScalarsCost, ScalarTy, + VL.size(), Mask, UserEntryCost); } BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
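A note for trying the change locally: the RUN lines of the updated test are not part of this diff, so the invocation below is an assumed sketch rather than a quote from the test file.

  opt -passes=slp-vectorizer -mtriple=riscv64-unknown-linux-gnu -mattr=+v \
      -slp-threshold=-100 -S \
      llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll

The -slp-threshold option sets the SLPCostThreshold referenced by the cost model; the THRESH check prefix in the updated test presumably corresponds to a run with such a lowered threshold.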