[SLP]Check for extracts, being replaced by original scalars, for user nodes #149572
base: main
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-risc-v

Author: Alexey Bataev (alexey-bataev)

Changes: Better to check whether the extracts for external uses of the user nodes are expensive and would be better represented as the original scalars. In that case, it is better to keep their operands scalar too, e.g. for a buildvector node. Otherwise, expensive extracts will be created, pessimizing the cost of the tree.

Patch is 23.38 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/149572.diff

2 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 6ad5c60105a28..14d4e45d1ab6f 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1906,6 +1906,7 @@ class BoUpSLP {
void deleteTree() {
VectorizableTree.clear();
ScalarToTreeEntries.clear();
+ PostponedNodesWithNonVecUsers.clear();
OperandsToTreeEntry.clear();
ScalarsInSplitNodes.clear();
MustGather.clear();
@@ -3896,6 +3897,9 @@ class BoUpSLP {
bool hasState() const { return S.valid(); }
+ /// Returns the state of the operations.
+ const InstructionsState &getOperations() const { return S; }
+
/// When ReuseReorderShuffleIndices is empty it just returns position of \p
/// V within vector of Scalars. Otherwise, try to remap on its reuse index.
unsigned findLaneForValue(Value *V) const {
@@ -4290,6 +4294,13 @@ class BoUpSLP {
OrdersType &CurrentOrder,
SmallVectorImpl<Value *> &PointerOps);
+ /// Checks if it is profitable to vectorize the specified list of
+ /// instructions if not all users are vectorized.
+ bool isProfitableToVectorizeWithNonVecUsers(const InstructionsState &S,
+ const EdgeInfo &UserTreeIdx,
+ ArrayRef<Value *> Scalars,
+ ArrayRef<int> ScalarsMask);
+
/// Maps a specific scalar to its tree entry(ies).
SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
@@ -4300,6 +4311,9 @@ class BoUpSLP {
/// Scalars, used in split vectorize nodes.
SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
+ /// List of tree node indices which have non-vectorized users.
+ SmallSet<unsigned, 4> PostponedNodesWithNonVecUsers;
+
/// Maps a value to the proposed vectorizable size.
SmallDenseMap<Value *, unsigned> InstrElementSize;
@@ -8993,6 +9007,81 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
return {IntrinsicCost, LibCost};
}
+bool BoUpSLP::isProfitableToVectorizeWithNonVecUsers(
+ const InstructionsState &S, const EdgeInfo &UserTreeIdx,
+ ArrayRef<Value *> Scalars, ArrayRef<int> ScalarsMask) {
+ assert(S && "Expected valid instructions state.");
+ // Loads, extracts and geps are immediately scalarizable, so no need to check.
+ if (S.getOpcode() == Instruction::Load ||
+ S.getOpcode() == Instruction::ExtractElement ||
+ S.getOpcode() == Instruction::GetElementPtr)
+ return true;
+ // Check only vectorized users; others are (potentially) already
+ // scalarized.
+ if (!UserTreeIdx.UserTE || UserTreeIdx.UserTE->isGather() ||
+ UserTreeIdx.UserTE->State == TreeEntry::SplitVectorize)
+ return true;
+ // PHI nodes may have cyclic deps, so cannot check here.
+ if (UserTreeIdx.UserTE->getOpcode() == Instruction::PHI)
+ return true;
+ // Do not check root reduction nodes; they do not have non-vectorized users.
+ if (UserIgnoreList && UserTreeIdx.UserTE->Idx == 0)
+ return true;
+ constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ ArrayRef<Value *> VL = UserTreeIdx.UserTE->Scalars;
+ Type *UserScalarTy = getValueType(VL.front());
+ if (!isValidElementType(UserScalarTy))
+ return true;
+ Type *ScalarTy = getValueType(Scalars.front());
+ if (!isValidElementType(ScalarTy))
+ return true;
+ // Ignore subvector extracts.
+ if (UserScalarTy->isVectorTy())
+ return true;
+ auto *UserVecTy =
+ getWidenedType(UserScalarTy, UserTreeIdx.UserTE->getVectorFactor());
+ APInt DemandedElts = APInt::getZero(UserTreeIdx.UserTE->getVectorFactor());
+ // Check the external uses to see whether the vector node plus the
+ // extracts is unprofitable for vectorization.
+ InstructionCost UserScalarsCost = 0;
+ for (Value *V : VL) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ continue;
+ if (areAllUsersVectorized(I, UserIgnoreList))
+ continue;
+ DemandedElts.setBit(UserTreeIdx.UserTE->findLaneForValue(V));
+ UserScalarsCost += TTI->getInstructionCost(I, CostKind);
+ }
+ // No non-vectorized users - success.
+ if (DemandedElts.isZero())
+ return true;
+ // If extracts are cheaper than the original scalars - success.
+ InstructionCost ExtractCost =
+ ::getScalarizationOverhead(*TTI, UserScalarTy, UserVecTy, DemandedElts,
+ /*Insert=*/false, /*Extract=*/true, CostKind);
+ if (ExtractCost <= UserScalarsCost)
+ return true;
+ SmallPtrSet<Value *, 4> CheckedExtracts;
+ InstructionCost NodeCost =
+ UserTreeIdx.UserTE->State == TreeEntry::CombinedVectorize
+ ? InstructionCost(0)
+ : getEntryCost(UserTreeIdx.UserTE, {}, CheckedExtracts);
+ // The node is profitable for vectorization - success.
+ if (ExtractCost + NodeCost <= -SLPCostThreshold)
+ return true;
+ auto *VecTy = getWidenedType(ScalarTy, Scalars.size());
+ InstructionCost ScalarsCost = ::getScalarizationOverhead(
+ *TTI, ScalarTy, VecTy, APInt::getAllOnes(Scalars.size()),
+ /*Insert=*/true, /*Extract=*/false, CostKind);
+ if (!ScalarsMask.empty())
+ ScalarsCost += getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, VecTy,
+ ScalarsMask, CostKind);
+
+ // User extracts are cheaper than user scalars + immediate scalars - success.
+ return ExtractCost - (UserScalarsCost + ScalarsCost) < -SLPCostThreshold;
+}
+
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
const InstructionsState &S, ArrayRef<Value *> VL,
bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
@@ -10283,6 +10372,17 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
return;
}
+ // Postpone vectorization if the node is not profitable because of the
+ // external uses.
+ if (!isProfitableToVectorizeWithNonVecUsers(S, UserTreeIdx, VL,
+ ReuseShuffleIndices)) {
+ PostponedNodesWithNonVecUsers.insert(VectorizableTree.size());
+ auto Invalid = ScheduleBundle::invalid();
+ newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndices);
+ return;
+ }
+
Instruction *VL0 = S.getMainOp();
BasicBlock *BB = VL0->getParent();
auto &BSRef = BlocksSchedules[BB];
@@ -11668,6 +11768,27 @@ void BoUpSLP::transformNodes() {
ArrayRef<Value *> VL = E.Scalars;
const unsigned Sz = getVectorElementSize(VL.front());
unsigned MinVF = getMinVF(2 * Sz);
+ const EdgeInfo &EI = E.UserTreeIndex;
+ // Try to vectorize postponed scalars if external uses are vectorized.
+ if (PostponedNodesWithNonVecUsers.contains(E.Idx) &&
+ isProfitableToVectorizeWithNonVecUsers(
+ E.getOperations(), EI, E.Scalars, E.ReuseShuffleIndices)) {
+ assert(E.hasState() && "Expected to have state");
+ unsigned PrevSize = VectorizableTree.size();
+ [[maybe_unused]] unsigned PrevEntriesSize =
+ LoadEntriesToVectorize.size();
+ buildTreeRec(VL, 0, EdgeInfo(&E, UINT_MAX));
+ if (PrevSize + 1 == VectorizableTree.size() &&
+ VectorizableTree[PrevSize]->isGather()) {
+ VectorizableTree.pop_back();
+ assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
+ "LoadEntriesToVectorize expected to remain the same");
+ } else {
+ E.CombinedEntriesWithIndices.emplace_back(PrevSize, 0);
+ continue;
+ }
+ }
+
// Do not try partial vectorization for small nodes (<= 2), nodes with the
// same opcode and same parent block or all constants.
if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
@@ -12828,7 +12949,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
});
InVectors.front() = V;
}
- if (!SubVectors.empty()) {
+ if (!SubVectors.empty() &&
+ (SubVectors.size() > 1 || SubVectors.front().second != 0 ||
+ SubVectors.front().first->getVectorFactor() != CommonMask.size())) {
const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
if (InVectors.size() == 2)
Cost += createShuffle(Vec, InVectors.back(), CommonMask);
@@ -13348,7 +13471,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
- auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
+ auto SrcIt =
+ MinBWs.empty() ? MinBWs.end() : MinBWs.find(getOperandEntry(E, 0));
Type *SrcScalarTy = VL0->getOperand(0)->getType();
auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
unsigned Opcode = ShuffleOrOp;
@@ -13795,7 +13919,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
- auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
+ auto SrcIt = MinBWs.empty() ? MinBWs.end()
+ : MinBWs.find(getOperandEntry(E, 0));
unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
unsigned SrcBWSz =
DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
@@ -14793,6 +14918,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
auto *Inst = cast<Instruction>(EU.Scalar);
InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
auto OperandIsScalar = [&](Value *V) {
+ if (!isa<Instruction>(V))
+ return true;
if (!isVectorized(V)) {
// Some extractelements might be not vectorized, but
// transformed into shuffle and removed from the function,
@@ -16873,7 +17000,18 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
});
InVectors.front() = Vec;
}
- if (!SubVectors.empty()) {
+ if (SubVectors.size() == 1 && SubVectors.front().second == 0 &&
+ SubVectors.front().first->getVectorFactor() == CommonMask.size()) {
+ Value *Vec = SubVectors.front().first->VectorizedValue;
+ if (Vec->getType()->isIntOrIntVectorTy())
+ Vec = castToScalarTyElem(
+ Vec, any_of(SubVectors.front().first->Scalars, [&](Value *V) {
+ if (isa<PoisonValue>(V))
+ return false;
+ return !isKnownNonNegative(V, SimplifyQuery(*R.DL));
+ }));
+ transformMaskAfterShuffle(CommonMask, CommonMask);
+ } else if (!SubVectors.empty()) {
Value *Vec = InVectors.front();
if (InVectors.size() == 2) {
Vec = createShuffle(Vec, InVectors.back(), CommonMask);
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll
index d4e323819402c..117c892b79f9a 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll
@@ -102,80 +102,88 @@ define fastcc i32 @test(i32 %0, i32 %add111.i.i, <4 x i32> %PredPel.i.sroa.86.72
; THRESH-NEXT: [[SHR143_5_I_I9:%.*]] = ashr i32 [[TMP0]], 1
; THRESH-NEXT: [[ADD1392_I:%.*]] = add i32 [[TMP0]], 1
; THRESH-NEXT: [[MUL1445_I:%.*]] = shl i32 [[TMP0]], 1
-; THRESH-NEXT: [[ADD2136_I:%.*]] = or i32 [[LOOPARRAY_SROA_24_0_I_I3]], [[TMP0]]
-; THRESH-NEXT: [[SHR2137_I:%.*]] = lshr i32 [[ADD2136_I]], 1
-; THRESH-NEXT: [[CONV2138_I:%.*]] = trunc i32 [[SHR2137_I]] to i16
; THRESH-NEXT: [[ADD2174_I:%.*]] = add i32 [[MUL1445_I]], 2
; THRESH-NEXT: [[SHR2175_I:%.*]] = lshr i32 [[ADD2174_I]], 2
-; THRESH-NEXT: [[CONV2176_I:%.*]] = trunc i32 [[SHR2175_I]] to i16
; THRESH-NEXT: [[ADD2190_I:%.*]] = or i32 [[ADD1392_I]], 1
; THRESH-NEXT: [[ADD2191_I:%.*]] = add i32 [[ADD2190_I]], [[TMP0]]
-; THRESH-NEXT: [[CONV2193_I:%.*]] = trunc i32 [[ADD2191_I]] to i16
-; THRESH-NEXT: [[ADD2203_I:%.*]] = or i32 [[TMP0]], 1
-; THRESH-NEXT: [[ADD2204_I:%.*]] = add i32 [[ADD2203_I]], [[TMP0]]
-; THRESH-NEXT: [[CONV2206_I:%.*]] = trunc i32 [[ADD2204_I]] to i16
+; THRESH-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; THRESH-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 0
+; THRESH-NEXT: [[TMP4:%.*]] = add <2 x i32> [[TMP3]], splat (i32 1)
+; THRESH-NEXT: [[TMP5:%.*]] = lshr <2 x i32> [[TMP4]], splat (i32 1)
; THRESH-NEXT: [[ADD2235_I16:%.*]] = or i32 [[TMP0]], 1
; THRESH-NEXT: [[ADD2236_I:%.*]] = add i32 [[ADD2235_I16]], 1
; THRESH-NEXT: [[SHR2237_I:%.*]] = lshr i32 [[ADD2236_I]], 1
-; THRESH-NEXT: [[CONV2238_I:%.*]] = trunc i32 [[SHR2237_I]] to i16
-; THRESH-NEXT: store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8196), align 4
-; THRESH-NEXT: store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8176), align 8
-; THRESH-NEXT: [[ADD2258_I:%.*]] = or i32 [[ADD111_I_I]], [[TMP0]]
-; THRESH-NEXT: [[SHR2259_I:%.*]] = lshr i32 [[ADD2258_I]], 1
-; THRESH-NEXT: [[CONV2260_I:%.*]] = trunc i32 [[SHR2259_I]] to i16
-; THRESH-NEXT: store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8212), align 4
-; THRESH-NEXT: store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8192), align 8
-; THRESH-NEXT: store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8172), align 4
+; THRESH-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[ADD111_I_I]], i32 0
+; THRESH-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 1
+; THRESH-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
+; THRESH-NEXT: [[TMP21:%.*]] = shufflevector <2 x i32> [[TMP20]], <2 x i32> poison, <2 x i32> zeroinitializer
+; THRESH-NEXT: [[TMP22:%.*]] = or <2 x i32> [[TMP19]], [[TMP21]]
; THRESH-NEXT: [[ADD2302_I:%.*]] = add i32 [[TMP0]], 1
; THRESH-NEXT: [[SHR2303_I:%.*]] = lshr i32 [[ADD2302_I]], 1
-; THRESH-NEXT: [[CONV2304_I:%.*]] = trunc i32 [[SHR2303_I]] to i16
-; THRESH-NEXT: store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8224), align 8
-; THRESH-NEXT: store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8204), align 4
-; THRESH-NEXT: store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8184), align 8
; THRESH-NEXT: [[ADD2323_I:%.*]] = add i32 [[TMP0]], 1
; THRESH-NEXT: [[ADD2324_I:%.*]] = or i32 [[ADD2323_I]], [[TMP0]]
; THRESH-NEXT: [[SHR2325_I:%.*]] = lshr i32 [[ADD2324_I]], 1
-; THRESH-NEXT: [[CONV2326_I:%.*]] = trunc i32 [[SHR2325_I]] to i16
-; THRESH-NEXT: store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8220), align 4
-; THRESH-NEXT: store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8200), align 8
-; THRESH-NEXT: [[ADD2342_I:%.*]] = add i32 [[SHR143_5_I_I9]], 1
-; THRESH-NEXT: [[SHR2343_I:%.*]] = lshr i32 [[ADD2342_I]], 1
-; THRESH-NEXT: [[CONV2344_I:%.*]] = trunc i32 [[SHR2343_I]] to i16
-; THRESH-NEXT: store i16 [[CONV2344_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8216), align 8
+; THRESH-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 1
+; THRESH-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP25]], i32 [[SHR143_5_I_I9]], i32 0
+; THRESH-NEXT: [[TMP13:%.*]] = add <2 x i32> [[TMP12]], splat (i32 1)
+; THRESH-NEXT: [[TMP14:%.*]] = or <2 x i32> [[TMP12]], splat (i32 1)
+; THRESH-NEXT: [[TMP15:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> <i32 0, i32 3>
; THRESH-NEXT: [[ADD2355_I:%.*]] = or i32 [[SHR143_5_I_I9]], 1
; THRESH-NEXT: [[ADD2356_I:%.*]] = add i32 [[ADD2355_I]], [[TMP0]]
; THRESH-NEXT: [[CONV2358_I:%.*]] = trunc i32 [[ADD2356_I]] to i16
; THRESH-NEXT: store i16 [[CONV2358_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8232), align 8
-; THRESH-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; THRESH-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 0
-; THRESH-NEXT: [[TMP4:%.*]] = add <2 x i32> [[TMP3]], splat (i32 1)
-; THRESH-NEXT: [[TMP5:%.*]] = lshr <2 x i32> [[TMP4]], splat (i32 1)
-; THRESH-NEXT: [[TMP6:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16>
-; THRESH-NEXT: store <2 x i16> [[TMP6]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8180), align 4
; THRESH-NEXT: [[ADD2393_I:%.*]] = or i32 [[LOOPARRAY_SROA_24_0_I_I3]], 1
; THRESH-NEXT: [[ADD2394_I:%.*]] = add i32 [[ADD2393_I]], [[TMP0]]
-; THRESH-NEXT: [[CONV2396_I:%.*]] = trunc i32 [[ADD2394_I]] to i16
-; THRESH-NEXT: store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8198), align 2
-; THRESH-NEXT: store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8178), align 2
-; THRESH-NEXT: store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8214), align 2
-; THRESH-NEXT: store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8194), align 2
-; THRESH-NEXT: store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8174), align 2
+; THRESH-NEXT: [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; THRESH-NEXT: [[TMP17:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[SHR2303_I]], i32 2
+; THRESH-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> [[TMP17]], i32 [[SHR2175_I]], i32 3
+; THRESH-NEXT: [[CONV2176_I:%.*]] = trunc i32 [[SHR2175_I]] to i16
+; THRESH-NEXT: [[CONV2238_I:%.*]] = trunc i32 [[SHR2237_I]] to i16
+; THRESH-NEXT: store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8196), align 4
; THRESH-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[PREDPEL_I_SROA_86_72_VEC_EXTRACT]], <4 x i32> poison, <2 x i32> <i32 poison, i32 0>
; THRESH-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[ADD111_I_I]], i32 0
; THRESH-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP8]], splat (i32 1)
; THRESH-NEXT: [[TMP10:%.*]] = lshr <2 x i32> [[TMP9]], splat (i32 1)
+; THRESH-NEXT: [[TMP23:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; THRESH-NEXT: [[TMP24:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> [[TMP23]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
; THRESH-NEXT: [[TMP11:%.*]] = trunc <2 x i32> [[TMP10]] to <2 x i16>
-; THRESH-NEXT: [[TMP12:%.*]] = extractelement <2 x i16> [[TMP11]], i32 1
-; THRESH-NEXT: store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8228), align 4
+; THRESH-NEXT: [[TMP26:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; THRESH-NEXT: [[TMP27:%.*]] = trunc <2 x i32> [[TMP10]] to <2 x i16>
+; THRESH-NEXT: store <2 x i16> [[TMP27]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8228), align 4
; THRESH-NEXT: store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8208), align 8
-; THRESH-NEXT: store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8188), align 4
-; THRESH-NEXT: store i16 [[TMP12]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8170), align 2
; THRESH-NEXT: store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8226), align 2
; THRESH-NEXT: store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8206), align 2
-; THRESH-NEXT: store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8186), align 2
+; THRESH-NEXT: [[CONV...
[truncated]
Ping!
Better to check whether the extracts for external uses of the user nodes are expensive and would be better represented as the original scalars. In that case, it is better to keep their operands scalar too, e.g. for a buildvector node. Otherwise, expensive extracts will be created, pessimizing the cost of the tree.
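For reference, the new heuristic ultimately reduces to three cost comparisons. Below is a minimal, self-contained C++ sketch of that decision; the struct and function names here are hypothetical, and the real implementation computes each value via LLVM's InstructionCost and the TTI scalarization-overhead queries shown in the diff above.

#include <cstdint>

// Hypothetical sketch of the decision made by
// isProfitableToVectorizeWithNonVecUsers (illustrative names only).
struct Costs {
  int64_t Extract;     // extractelements for the non-vectorized users
  int64_t UserScalars; // keeping those user instructions scalar
  int64_t UserNode;    // vectorizing the user node itself
  int64_t BuildVector; // gathering the candidate scalars (+ reuse shuffle)
};

// Returns true to vectorize now; false postpones the node.
bool profitableWithNonVecUsers(const Costs &C, int64_t SLPCostThreshold) {
  // Extracting the demanded lanes beats re-materializing the scalars.
  if (C.Extract <= C.UserScalars)
    return true;
  // The user node itself is profitable enough to absorb the extract cost.
  if (C.Extract + C.UserNode <= -SLPCostThreshold)
    return true;
  // Otherwise the extracts must beat scalars + buildvector by the threshold.
  return C.Extract - (C.UserScalars + C.BuildVector) < -SLPCostThreshold;
}

Per the diff, when none of the checks succeed, buildTreeRec records the node in PostponedNodesWithNonVecUsers and emits a gather entry; transformNodes() later re-runs the same check and rebuilds the node if its external users have been vectorized in the meantime.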