-
Notifications
You must be signed in to change notification settings - Fork 14.6k
[SLP] Check for extracts, being replaced by original scalars, for user nodes #149572
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2008,6 +2008,7 @@ class BoUpSLP { | |
void deleteTree() { | ||
VectorizableTree.clear(); | ||
ScalarToTreeEntries.clear(); | ||
PostponedNodesWithNonVecUsers.clear(); | ||
OperandsToTreeEntry.clear(); | ||
ScalarsInSplitNodes.clear(); | ||
MustGather.clear(); | ||
|
@@ -4016,6 +4017,9 @@ class BoUpSLP { | |
/// Returns true if any scalar in the list is a copyable element. | ||
bool hasCopyableElements() const { return !CopyableElements.empty(); } | ||
|
||
/// Returns the state of the operations. | ||
const InstructionsState &getOperations() const { return S; } | ||
|
||
/// When ReuseReorderShuffleIndices is empty it just returns position of \p | ||
/// V within vector of Scalars. Otherwise, try to remap on its reuse index. | ||
unsigned findLaneForValue(Value *V) const { | ||
|
@@ -4419,6 +4423,13 @@ class BoUpSLP { | |
OrdersType &CurrentOrder, | ||
SmallVectorImpl<Value *> &PointerOps); | ||
|
||
/// Checks if it is profitable to vectorize the specified list of the | ||
/// instructions if not all users are vectorized. | ||
bool isProfitableToVectorizeWithNonVecUsers(const InstructionsState &S, | ||
const EdgeInfo &UserTreeIdx, | ||
ArrayRef<Value *> VL, | ||
ArrayRef<int> Mask); | ||
|
||
/// Maps a specific scalar to its tree entry(ies). | ||
SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries; | ||
|
||
|
@@ -4429,6 +4440,9 @@ class BoUpSLP { | |
/// Scalars, used in split vectorize nodes. | ||
SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes; | ||
|
||
/// List of tree nodes indices, which have non-vectorized users. | ||
SmallSet<unsigned, 4> PostponedNodesWithNonVecUsers; | ||
|
||
/// Maps a value to the proposed vectorizable size. | ||
SmallDenseMap<Value *, unsigned> InstrElementSize; | ||
|
||
|
@@ -9151,6 +9165,93 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, | |
return {IntrinsicCost, LibCost}; | ||
} | ||
|
||
/// Check if extracts are cheaper than the original scalars. | ||
static bool | ||
areExtractsCheaperThanScalars(TargetTransformInfo &TTI, Type *UserScalarTy, | ||
VectorType *UserVecTy, const APInt &DemandedElts, | ||
const InstructionCost UserScalarsCost, | ||
Type *ScalarTy, unsigned VF, ArrayRef<int> Mask, | ||
InstructionCost UserEntryCost) { | ||
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; | ||
// If extracts are cheaper than the original scalars - success. | ||
InstructionCost ExtractCost = | ||
alexey-bataev marked this conversation as resolved.
Show resolved
Hide resolved
|
||
::getScalarizationOverhead(TTI, UserScalarTy, UserVecTy, DemandedElts, | ||
/*Insert=*/false, /*Extract=*/true, CostKind); | ||
if (ExtractCost <= UserScalarsCost) | ||
return true; | ||
// The node is profitable for vectorization - success. | ||
if (ExtractCost <= UserEntryCost) | ||
return true; | ||
auto *VecTy = getWidenedType(ScalarTy, VF); | ||
InstructionCost ScalarsCost = | ||
::getScalarizationOverhead(TTI, ScalarTy, VecTy, APInt::getAllOnes(VF), | ||
alexey-bataev marked this conversation as resolved.
Show resolved
Hide resolved
|
||
/*Insert=*/true, /*Extract=*/false, CostKind); | ||
if (!Mask.empty()) | ||
ScalarsCost += | ||
getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, VecTy, Mask, CostKind); | ||
return ExtractCost < UserScalarsCost + ScalarsCost; | ||
} | ||
|
||
bool BoUpSLP::isProfitableToVectorizeWithNonVecUsers( | ||
alexey-bataev marked this conversation as resolved.
Show resolved
Hide resolved
|
||
const InstructionsState &S, const EdgeInfo &UserTreeIdx, | ||
ArrayRef<Value *> VL, ArrayRef<int> Mask) { | ||
assert(S && "Expected valid instructions state."); | ||
// Loads, extracts and geps are immediately scalarizable, so no need to check. | ||
if (S.getOpcode() == Instruction::Load || | ||
S.getOpcode() == Instruction::ExtractElement || | ||
S.getOpcode() == Instruction::GetElementPtr) | ||
return true; | ||
// Check only vectorized users, others scalarized (potentially, at least) | ||
// already. | ||
if (!UserTreeIdx.UserTE || UserTreeIdx.UserTE->isGather() || | ||
UserTreeIdx.UserTE->State == TreeEntry::SplitVectorize) | ||
return true; | ||
// PHI nodes may have cyclic deps, so cannot check here. | ||
if (UserTreeIdx.UserTE->getOpcode() == Instruction::PHI) | ||
return true; | ||
// Do not check root reduction nodes, they do not have non-vectorized users. | ||
if (UserIgnoreList && UserTreeIdx.UserTE->Idx == 0) | ||
return true; | ||
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; | ||
ArrayRef<Value *> UserVL = UserTreeIdx.UserTE->Scalars; | ||
Type *UserScalarTy = getValueType(UserVL.front()); | ||
if (!isValidElementType(UserScalarTy)) | ||
return true; | ||
Type *ScalarTy = getValueType(VL.front()); | ||
alexey-bataev marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if (!isValidElementType(ScalarTy)) | ||
return true; | ||
// Ignore subvectors extracts for revectorized nodes, subvector extracts are | ||
alexey-bataev marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// always cheap as they do not require vector-to-scalar move. | ||
if (UserScalarTy->isVectorTy()) | ||
return true; | ||
auto *UserVecTy = | ||
getWidenedType(UserScalarTy, UserTreeIdx.UserTE->getVectorFactor()); | ||
alexey-bataev marked this conversation as resolved.
Show resolved
Hide resolved
|
||
APInt DemandedElts = APInt::getZero(UserTreeIdx.UserTE->getVectorFactor()); | ||
// Check the external uses and check, if vector node + extracts is not | ||
// profitable for the vectorization. | ||
InstructionCost UserScalarsCost = 0; | ||
for (Value *V : UserVL) { | ||
auto *I = dyn_cast<Instruction>(V); | ||
if (!I) | ||
continue; | ||
if (areAllUsersVectorized(I, UserIgnoreList)) | ||
continue; | ||
alexey-bataev marked this conversation as resolved.
Show resolved
Hide resolved
|
||
DemandedElts.setBit(UserTreeIdx.UserTE->findLaneForValue(V)); | ||
UserScalarsCost += TTI->getInstructionCost(I, CostKind); | ||
} | ||
// No non-vectorized users - success. | ||
if (DemandedElts.isZero()) | ||
return true; | ||
|
||
// User extracts are cheaper than user scalars + immediate scalars - success. | ||
SmallPtrSet<Value *, 4> CheckedExtracts; | ||
InstructionCost UserEntryCost = | ||
getEntryCost(UserTreeIdx.UserTE, {}, CheckedExtracts); | ||
return areExtractsCheaperThanScalars(*TTI, UserScalarTy, UserVecTy, | ||
DemandedElts, UserScalarsCost, ScalarTy, | ||
VL.size(), Mask, UserEntryCost); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Instead of creating another lambda which is here to capture |
||
} | ||
|
||
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( | ||
const InstructionsState &S, ArrayRef<Value *> VL, | ||
bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder, | ||
|
@@ -10700,6 +10801,15 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth, | |
return; | ||
} | ||
|
||
// Postpone vectorization, if the node is not profitable because of the | ||
// external uses. | ||
if (!isProfitableToVectorizeWithNonVecUsers(S, UserTreeIdx, VL, | ||
ReuseShuffleIndices)) { | ||
PostponedNodesWithNonVecUsers.insert(VectorizableTree.size()); | ||
newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices); | ||
return; | ||
} | ||
|
||
Instruction *VL0 = S.getMainOp(); | ||
BasicBlock *BB = VL0->getParent(); | ||
auto &BSRef = BlocksSchedules[BB]; | ||
|
@@ -12085,6 +12195,27 @@ void BoUpSLP::transformNodes() { | |
ArrayRef<Value *> VL = E.Scalars; | ||
const unsigned Sz = getVectorElementSize(VL.front()); | ||
unsigned MinVF = getMinVF(2 * Sz); | ||
const EdgeInfo &EI = E.UserTreeIndex; | ||
// Try to vectorized postponed scalars, if external uses are vectorized. | ||
if (PostponedNodesWithNonVecUsers.contains(E.Idx) && | ||
isProfitableToVectorizeWithNonVecUsers( | ||
E.getOperations(), EI, E.Scalars, E.ReuseShuffleIndices)) { | ||
assert(E.hasState() && "Expected to have state"); | ||
unsigned PrevSize = VectorizableTree.size(); | ||
[[maybe_unused]] unsigned PrevEntriesSize = | ||
LoadEntriesToVectorize.size(); | ||
buildTreeRec(VL, 0, EdgeInfo(&E, UINT_MAX)); | ||
if (PrevSize + 1 == VectorizableTree.size() && | ||
VectorizableTree[PrevSize]->isGather()) { | ||
VectorizableTree.pop_back(); | ||
assert(PrevEntriesSize == LoadEntriesToVectorize.size() && | ||
"LoadEntriesToVectorize expected to remain the same"); | ||
} else { | ||
E.CombinedEntriesWithIndices.emplace_back(PrevSize, 0); | ||
continue; | ||
} | ||
} | ||
|
||
// Do not try partial vectorization for small nodes (<= 2), nodes with the | ||
// same opcode and same parent block or all constants. | ||
if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) || | ||
|
@@ -13245,7 +13376,10 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { | |
}); | ||
InVectors.front() = V; | ||
} | ||
if (!SubVectors.empty()) { | ||
bool FullSubvectorMatch = | ||
SubVectors.size() == 1 && SubVectors.front().second == 0 && | ||
SubVectors.front().first->getVectorFactor() == CommonMask.size(); | ||
if (!SubVectors.empty() && !FullSubvectorMatch) { | ||
const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front(); | ||
if (InVectors.size() == 2) | ||
Cost += createShuffle(Vec, InVectors.back(), CommonMask); | ||
|
@@ -13356,6 +13490,16 @@ TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const { | |
InstructionCost | ||
alexey-bataev marked this conversation as resolved.
Show resolved
Hide resolved
|
||
BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, | ||
SmallPtrSetImpl<Value *> &CheckedExtracts) { | ||
// No need to count the cost for combined entries, they are combined and | ||
// just skip their cost. | ||
if (E->State == TreeEntry::CombinedVectorize) { | ||
LLVM_DEBUG( | ||
dbgs() << "SLP: Skipping cost for combined node that starts with " | ||
<< E->Scalars[0] << ".\n"; | ||
E->dump()); | ||
return 0; | ||
} | ||
|
||
ArrayRef<Value *> VL = E->Scalars; | ||
|
||
Type *ScalarTy = getValueType(VL[0]); | ||
|
@@ -13767,7 +13911,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, | |
case Instruction::Trunc: | ||
case Instruction::FPTrunc: | ||
case Instruction::BitCast: { | ||
auto SrcIt = MinBWs.find(getOperandEntry(E, 0)); | ||
auto SrcIt = | ||
MinBWs.empty() ? MinBWs.end() : MinBWs.find(getOperandEntry(E, 0)); | ||
Type *SrcScalarTy = VL0->getOperand(0)->getType(); | ||
auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size()); | ||
unsigned Opcode = ShuffleOrOp; | ||
|
@@ -14214,7 +14359,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, | |
Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType(); | ||
auto *SrcTy = getWidenedType(SrcSclTy, VL.size()); | ||
if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) { | ||
auto SrcIt = MinBWs.find(getOperandEntry(E, 0)); | ||
auto SrcIt = MinBWs.empty() ? MinBWs.end() | ||
: MinBWs.find(getOperandEntry(E, 0)); | ||
unsigned BWSz = DL->getTypeSizeInBits(ScalarTy); | ||
unsigned SrcBWSz = | ||
DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType()); | ||
|
@@ -15019,15 +15165,6 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals, | |
SmallPtrSet<Value *, 4> CheckedExtracts; | ||
for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { | ||
TreeEntry &TE = *VectorizableTree[I]; | ||
// No need to count the cost for combined entries, they are combined and | ||
// just skip their cost. | ||
if (TE.State == TreeEntry::CombinedVectorize) { | ||
LLVM_DEBUG( | ||
dbgs() << "SLP: Skipping cost for combined node that starts with " | ||
<< *TE.Scalars[0] << ".\n"; | ||
TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n"); | ||
continue; | ||
} | ||
if (TE.hasState() && | ||
(TE.isGather() || TE.State == TreeEntry::SplitVectorize)) { | ||
if (const TreeEntry *E = | ||
|
@@ -15242,6 +15379,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals, | |
auto *Inst = cast<Instruction>(EU.Scalar); | ||
InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind); | ||
auto OperandIsScalar = [&](Value *V) { | ||
if (!isa<Instruction>(V)) | ||
return true; | ||
if (!isVectorized(V)) { | ||
// Some extractelements might be not vectorized, but | ||
// transformed into shuffle and removed from the function, | ||
|
@@ -17338,7 +17477,20 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { | |
}); | ||
InVectors.front() = Vec; | ||
} | ||
if (!SubVectors.empty()) { | ||
const bool FullSubvectorMatch = | ||
SubVectors.size() == 1 && SubVectors.front().second == 0 && | ||
SubVectors.front().first->getVectorFactor() == CommonMask.size(); | ||
if (FullSubvectorMatch) { | ||
Value *Vec = SubVectors.front().first->VectorizedValue; | ||
if (Vec->getType()->isIntOrIntVectorTy()) | ||
Vec = castToScalarTyElem( | ||
Vec, any_of(SubVectors.front().first->Scalars, [&](Value *V) { | ||
if (isa<PoisonValue>(V)) | ||
return false; | ||
return !isKnownNonNegative(V, SimplifyQuery(*R.DL)); | ||
})); | ||
transformMaskAfterShuffle(CommonMask, CommonMask); | ||
} else if (!SubVectors.empty()) { | ||
Value *Vec = InVectors.front(); | ||
if (InVectors.size() == 2) { | ||
Vec = createShuffle(Vec, InVectors.back(), CommonMask); | ||
|
Uh oh!
There was an error while loading. Please reload this page.