Skip to content

Commit 004c67e

Browse files
authored
[LV] Vectorize maxnum/minnum w/o fast-math flags. (#148239)
Update LV to vectorize maxnum/minnum reductions without fast-math flags by adding an extra check in the loop for whether any input to maxnum/minnum is NaN; the check is needed because of how maxnum/minnum behave w.r.t. signaling NaNs. Signed zeros are already handled consistently by maxnum/minnum. If any input is NaN: (1) exit the vector loop, (2) compute the reduction result up to the vector iteration that contained NaN inputs, and (3) resume in the scalar loop. New recurrence kinds are added for reductions using maxnum/minnum without fast-math flags. PR: #148239
1 parent 695660c commit 004c67e

16 files changed

+731
-58
lines changed

llvm/include/llvm/Analysis/IVDescriptors.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ enum class RecurKind {
4747
FMul, ///< Product of floats.
4848
FMin, ///< FP min implemented in terms of select(cmp()).
4949
FMax, ///< FP max implemented in terms of select(cmp()).
50+
FMinNum, ///< FP min with llvm.minnum semantics including NaNs.
51+
FMaxNum, ///< FP max with llvm.maxnum semantics including NaNs.
5052
FMinimum, ///< FP min with llvm.minimum semantics
5153
FMaximum, ///< FP max with llvm.maximum semantics
5254
FMinimumNum, ///< FP min with llvm.minimumnum semantics
@@ -250,6 +252,7 @@ class RecurrenceDescriptor {
250252
/// Returns true if the recurrence kind is a floating-point min/max kind.
251253
static bool isFPMinMaxRecurrenceKind(RecurKind Kind) {
252254
return Kind == RecurKind::FMin || Kind == RecurKind::FMax ||
255+
Kind == RecurKind::FMinNum || Kind == RecurKind::FMaxNum ||
253256
Kind == RecurKind::FMinimum || Kind == RecurKind::FMaximum ||
254257
Kind == RecurKind::FMinimumNum || Kind == RecurKind::FMaximumNum;
255258
}

llvm/lib/Analysis/IVDescriptors.cpp

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -941,10 +941,30 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr(
941941
m_Intrinsic<Intrinsic::minimumnum>(m_Value(), m_Value())) ||
942942
match(I, m_Intrinsic<Intrinsic::maximumnum>(m_Value(), m_Value()));
943943
};
944-
if (isIntMinMaxRecurrenceKind(Kind) ||
945-
(HasRequiredFMF() && isFPMinMaxRecurrenceKind(Kind)))
944+
if (isIntMinMaxRecurrenceKind(Kind))
946945
return isMinMaxPattern(I, Kind, Prev);
947-
else if (isFMulAddIntrinsic(I))
946+
if (isFPMinMaxRecurrenceKind(Kind)) {
947+
InstDesc Res = isMinMaxPattern(I, Kind, Prev);
948+
if (!Res.isRecurrence())
949+
return InstDesc(false, I);
950+
if (HasRequiredFMF())
951+
return Res;
952+
// We may be able to vectorize FMax/FMin reductions using maxnum/minnum
953+
// intrinsics with extra checks ensuring the vector loop handles only
954+
// non-NaN inputs.
955+
if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value()))) {
956+
assert(Kind == RecurKind::FMax &&
957+
"unexpected recurrence kind for maxnum");
958+
return InstDesc(I, RecurKind::FMaxNum);
959+
}
960+
if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value()))) {
961+
assert(Kind == RecurKind::FMin &&
962+
"unexpected recurrence kind for minnum");
963+
return InstDesc(I, RecurKind::FMinNum);
964+
}
965+
return InstDesc(false, I);
966+
}
967+
if (isFMulAddIntrinsic(I))
948968
return InstDesc(Kind == RecurKind::FMulAdd, I,
949969
I->hasAllowReassoc() ? nullptr : I);
950970
return InstDesc(false, I);

llvm/lib/Transforms/Utils/LoopUtils.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -938,8 +938,10 @@ constexpr Intrinsic::ID llvm::getReductionIntrinsicID(RecurKind RK) {
938938
case RecurKind::UMin:
939939
return Intrinsic::vector_reduce_umin;
940940
case RecurKind::FMax:
941+
case RecurKind::FMaxNum:
941942
return Intrinsic::vector_reduce_fmax;
942943
case RecurKind::FMin:
944+
case RecurKind::FMinNum:
943945
return Intrinsic::vector_reduce_fmin;
944946
case RecurKind::FMaximum:
945947
return Intrinsic::vector_reduce_fmaximum;
@@ -1037,8 +1039,10 @@ Intrinsic::ID llvm::getMinMaxReductionIntrinsicOp(RecurKind RK) {
10371039
case RecurKind::SMax:
10381040
return Intrinsic::smax;
10391041
case RecurKind::FMin:
1042+
case RecurKind::FMinNum:
10401043
return Intrinsic::minnum;
10411044
case RecurKind::FMax:
1045+
case RecurKind::FMaxNum:
10421046
return Intrinsic::maxnum;
10431047
case RecurKind::FMinimum:
10441048
return Intrinsic::minimum;
@@ -1096,9 +1100,9 @@ Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left,
10961100
Value *Right) {
10971101
Type *Ty = Left->getType();
10981102
if (Ty->isIntOrIntVectorTy() ||
1099-
(RK == RecurKind::FMinimum || RK == RecurKind::FMaximum ||
1103+
(RK == RecurKind::FMinNum || RK == RecurKind::FMaxNum ||
1104+
RK == RecurKind::FMinimum || RK == RecurKind::FMaximum ||
11001105
RK == RecurKind::FMinimumNum || RK == RecurKind::FMaximumNum)) {
1101-
// TODO: Add float minnum/maxnum support when FMF nnan is set.
11021106
Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK);
11031107
return Builder.CreateIntrinsic(Ty, Id, {Left, Right}, nullptr,
11041108
"rdx.minmax");
@@ -1308,6 +1312,8 @@ Value *llvm::createSimpleReduction(IRBuilderBase &Builder, Value *Src,
13081312
case RecurKind::UMin:
13091313
case RecurKind::FMax:
13101314
case RecurKind::FMin:
1315+
case RecurKind::FMinNum:
1316+
case RecurKind::FMaxNum:
13111317
case RecurKind::FMinimum:
13121318
case RecurKind::FMaximum:
13131319
case RecurKind::FMinimumNum:

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,6 @@ class VPBuilder {
230230

231231
/// Create a new ICmp VPInstruction with predicate \p Pred and operands \p A
232232
/// and \p B.
233-
/// TODO: add createFCmp when needed.
234233
VPInstruction *createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
235234
DebugLoc DL = DebugLoc::getUnknown(),
236235
const Twine &Name = "") {
@@ -240,6 +239,17 @@ class VPBuilder {
240239
new VPInstruction(Instruction::ICmp, {A, B}, Pred, DL, Name));
241240
}
242241

242+
/// Create a new FCmp VPInstruction with predicate \p Pred and operands \p A
243+
/// and \p B.
244+
VPInstruction *createFCmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
245+
DebugLoc DL = DebugLoc::getUnknown(),
246+
const Twine &Name = "") {
247+
assert(Pred >= CmpInst::FIRST_FCMP_PREDICATE &&
248+
Pred <= CmpInst::LAST_FCMP_PREDICATE && "invalid predicate");
249+
return tryInsertInstruction(
250+
new VPInstruction(Instruction::FCmp, {A, B}, Pred, DL, Name));
251+
}
252+
243253
VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset,
244254
DebugLoc DL = DebugLoc::getUnknown(),
245255
const Twine &Name = "") {

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4361,10 +4361,14 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
43614361

43624362
bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
43634363
ElementCount VF) const {
4364-
// Cross iteration phis such as reductions need special handling and are
4365-
// currently unsupported.
4366-
if (any_of(OrigLoop->getHeader()->phis(),
4367-
[&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
4364+
// Cross iteration phis such as fixed-order recurrences and FMaxNum/FMinNum
4365+
// reductions need special handling and are currently unsupported.
4366+
if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) {
4367+
if (!Legal->isReductionVariable(&Phi))
4368+
return Legal->isFixedOrderRecurrence(&Phi);
4369+
RecurKind RK = Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind();
4370+
return RK == RecurKind::FMinNum || RK == RecurKind::FMaxNum;
4371+
}))
43684372
return false;
43694373

43704374
// Phis with uses outside of the loop require special handling and are
@@ -8787,6 +8791,12 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
87878791
// Adjust the recipes for any inloop reductions.
87888792
adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
87898793

8794+
// Apply mandatory transformation to handle FP maxnum/minnum reduction with
8795+
// NaNs if possible, bail out otherwise.
8796+
if (!VPlanTransforms::runPass(
8797+
VPlanTransforms::handleMaxMinNumReductionsWithoutFastMath, *Plan))
8798+
return nullptr;
8799+
87908800
// Transform recipes to abstract recipes if it is legal and beneficial and
87918801
// clamp the range for better cost estimation.
87928802
// TODO: Enable following transform when the EVL-version of extended-reduction

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23202,6 +23202,8 @@ class HorizontalReduction {
2320223202
case RecurKind::FindFirstIVUMin:
2320323203
case RecurKind::FindLastIVSMax:
2320423204
case RecurKind::FindLastIVUMax:
23205+
case RecurKind::FMaxNum:
23206+
case RecurKind::FMinNum:
2320523207
case RecurKind::FMaximumNum:
2320623208
case RecurKind::FMinimumNum:
2320723209
case RecurKind::None:
@@ -23339,6 +23341,8 @@ class HorizontalReduction {
2333923341
case RecurKind::FindFirstIVUMin:
2334023342
case RecurKind::FindLastIVSMax:
2334123343
case RecurKind::FindLastIVUMax:
23344+
case RecurKind::FMaxNum:
23345+
case RecurKind::FMinNum:
2334223346
case RecurKind::FMaximumNum:
2334323347
case RecurKind::FMinimumNum:
2334423348
case RecurKind::None:
@@ -23441,6 +23445,8 @@ class HorizontalReduction {
2344123445
case RecurKind::FindFirstIVUMin:
2344223446
case RecurKind::FindLastIVSMax:
2344323447
case RecurKind::FindLastIVUMax:
23448+
case RecurKind::FMaxNum:
23449+
case RecurKind::FMinNum:
2344423450
case RecurKind::FMaximumNum:
2344523451
case RecurKind::FMinimumNum:
2344623452
case RecurKind::None:

llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
8484
return ResTy;
8585
}
8686
case Instruction::ICmp:
87+
case Instruction::FCmp:
8788
case VPInstruction::ActiveLaneMask:
8889
assert(inferScalarType(R->getOperand(0)) ==
8990
inferScalarType(R->getOperand(1)) &&

llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -652,3 +652,163 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
652652
Term->addMetadata(LLVMContext::MD_prof, BranchWeights);
653653
}
654654
}
655+
656+
/// Rewrite \p Plan so that a single FMaxNum/FMinNum reduction can be
/// vectorized without fast-math flags: the vector loop additionally exits as
/// soon as the non-phi operand of the min/max operation contains a NaN, the
/// reduction result is then computed from the reduction phi value at the start
/// of that vector iteration, and execution resumes in the scalar loop.
///
/// \returns true if \p Plan contains no FMaxNum/FMinNum reduction phi (nothing
/// to do) or if the transformation was applied. Returns false if the reduction
/// cannot be handled; in that case \p Plan may already be partially modified
/// and must be abandoned by the caller.
bool VPlanTransforms::handleMaxMinNumReductionsWithoutFastMath(VPlan &Plan) {
  // Returns the operand of the min/max recipe feeding the backedge of
  // \p RedPhiR that is not the reduction phi itself, or nullptr if the
  // backedge value is not produced by a widened or replicated intrinsic call.
  auto GetMinMaxCompareValue = [](VPReductionPHIRecipe *RedPhiR) -> VPValue * {
    auto *MinMaxR = dyn_cast<VPRecipeWithIRFlags>(
        RedPhiR->getBackedgeValue()->getDefiningRecipe());
    if (!MinMaxR)
      return nullptr;

    auto *RepR = dyn_cast<VPReplicateRecipe>(MinMaxR);
    if (!isa<VPWidenIntrinsicRecipe>(MinMaxR) &&
        !(RepR && isa<IntrinsicInst>(RepR->getUnderlyingInstr())))
      return nullptr;

#ifndef NDEBUG
    Intrinsic::ID RdxIntrinsicId =
        RedPhiR->getRecurrenceKind() == RecurKind::FMaxNum ? Intrinsic::maxnum
                                                           : Intrinsic::minnum;
    // The whole disjunction is parenthesized so that the message string is
    // attached to the complete condition rather than to its second half only.
    assert(((isa<VPWidenIntrinsicRecipe>(MinMaxR) &&
             cast<VPWidenIntrinsicRecipe>(MinMaxR)->getVectorIntrinsicID() ==
                 RdxIntrinsicId) ||
            (RepR &&
             cast<IntrinsicInst>(RepR->getUnderlyingInstr())
                     ->getIntrinsicID() == RdxIntrinsicId)) &&
           "Intrinsic did not match recurrence kind");
#endif

    if (MinMaxR->getOperand(0) == RedPhiR)
      return MinMaxR->getOperand(1);

    assert(MinMaxR->getOperand(1) == RedPhiR &&
           "Reduction phi operand expected");
    return MinMaxR->getOperand(0);
  };

  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  VPReductionPHIRecipe *RedPhiR = nullptr;
  bool HasUnsupportedPhi = false;
  for (auto &R : LoopRegion->getEntryBasicBlock()->phis()) {
    if (isa<VPCanonicalIVPHIRecipe, VPWidenIntOrFpInductionRecipe>(&R))
      continue;
    auto *Cur = dyn_cast<VPReductionPHIRecipe>(&R);
    if (!Cur) {
      // TODO: Also support fixed-order recurrence phis.
      HasUnsupportedPhi = true;
      continue;
    }
    // For now, only a single reduction is supported.
    // TODO: Support multiple MaxNum/MinNum reductions and other reductions.
    if (RedPhiR)
      return false;
    if (Cur->getRecurrenceKind() != RecurKind::FMaxNum &&
        Cur->getRecurrenceKind() != RecurKind::FMinNum) {
      HasUnsupportedPhi = true;
      continue;
    }
    RedPhiR = Cur;
  }

  // No FMaxNum/FMinNum reduction in the header: nothing to transform.
  if (!RedPhiR)
    return true;

  // We won't be able to resume execution in the scalar tail, if there are
  // unsupported header phis or there is no scalar tail at all, due to
  // tail-folding.
  if (HasUnsupportedPhi || !Plan.hasScalarTail())
    return false;

  VPValue *MinMaxOp = GetMinMaxCompareValue(RedPhiR);
  if (!MinMaxOp)
    return false;

  // Only referenced by the assertion below; marked to avoid an unused-variable
  // warning in builds where assertions are compiled out.
  [[maybe_unused]] RecurKind RedPhiRK = RedPhiR->getRecurrenceKind();
  assert((RedPhiRK == RecurKind::FMaxNum || RedPhiRK == RecurKind::FMinNum) &&
         "unsupported reduction");

  // Check if the vector loop of Plan can early exit and restart execution of
  // the last vector iteration in the scalar loop. This requires all recipes up
  // to the early exit point be side-effect free as they are re-executed.
  // Currently we check that the loop is free of any recipe that may write to
  // memory. Expected to operate on an early VPlan w/o nested regions.
  for (VPBlockBase *VPB :
       vp_depth_first_shallow(LoopRegion->getEntryBasicBlock())) {
    auto *VPBB = cast<VPBasicBlock>(VPB);
    for (auto &R : *VPBB) {
      if (R.mayWriteToMemory() &&
          !match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
        return false;
    }
  }

  // Locate the recipe computing the final reduction result before modifying
  // the plan. find_singleton yields nullptr unless exactly one user matches;
  // bail out in that case instead of dereferencing a null pointer later.
  auto *RdxResult = find_singleton<VPSingleDefRecipe>(
      RedPhiR->users(), [](VPUser *U, bool) -> VPSingleDefRecipe * {
        auto *VPI = dyn_cast<VPInstruction>(U);
        if (VPI && VPI->getOpcode() == VPInstruction::ComputeReductionResult)
          return VPI;
        return nullptr;
      });
  if (!RdxResult) {
    LLVM_DEBUG(dbgs() << "Did not find a unique ComputeReductionResult for "
                         "FMaxNum/FMinNum reduction.\n");
    return false;
  }

  // Replace the latch's BranchOnCount with a BranchOnCond that leaves the
  // vector loop either when the original exit condition holds or when any
  // lane of the min/max input is NaN.
  VPBasicBlock *LatchVPBB = LoopRegion->getExitingBasicBlock();
  VPBuilder Builder(LatchVPBB->getTerminator());
  auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
  assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
         "Unexpected terminator");
  auto *IsLatchExitTaken =
      Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
                         LatchExitingBranch->getOperand(1));

  // An unordered self-comparison is true exactly for NaN lanes.
  VPValue *IsNaN = Builder.createFCmp(CmpInst::FCMP_UNO, MinMaxOp, MinMaxOp);
  VPValue *AnyNaN = Builder.createNaryOp(VPInstruction::AnyOf, {IsNaN});
  auto *AnyExitTaken =
      Builder.createNaryOp(Instruction::Or, {AnyNaN, IsLatchExitTaken});
  Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);
  LatchExitingBranch->eraseFromParent();

  // If we exit early due to NaNs, compute the final reduction result based on
  // the reduction phi at the beginning of the last vector iteration.
  auto *MiddleVPBB = Plan.getMiddleBlock();
  Builder.setInsertPoint(MiddleVPBB, MiddleVPBB->begin());
  auto *NewSel =
      Builder.createSelect(AnyNaN, RedPhiR, RdxResult->getOperand(1));
  RdxResult->setOperand(1, NewSel);

  auto *ScalarPH = Plan.getScalarPreheader();
  // Update resume phis for inductions in the scalar preheader. If AnyNaN is
  // true, resume from the start of the last vector iteration via the
  // canonical IV, otherwise from the original value.
  for (auto &R : ScalarPH->phis()) {
    auto *ResumeR = cast<VPPhi>(&R);
    VPValue *VecV = ResumeR->getOperand(0);
    if (VecV == RdxResult)
      continue;
    if (auto *DerivedIV = dyn_cast<VPDerivedIVRecipe>(VecV)) {
      if (DerivedIV->getNumUsers() == 1 &&
          DerivedIV->getOperand(1) == &Plan.getVectorTripCount()) {
        auto *NewSel = Builder.createSelect(AnyNaN, Plan.getCanonicalIV(),
                                            &Plan.getVectorTripCount());
        // Keep the derived IV after the select it now depends on.
        DerivedIV->moveAfter(&*Builder.getInsertPoint());
        DerivedIV->setOperand(1, NewSel);
        continue;
      }
    }
    // Bail out and abandon the current, partially modified, VPlan if we
    // encounter resume phi that cannot be updated yet.
    if (VecV != &Plan.getVectorTripCount()) {
      LLVM_DEBUG(dbgs() << "Found resume phi we cannot update for VPlan with "
                           "FMaxNum/FMinNum reduction.\n");
      return false;
    }
    auto *NewSel = Builder.createSelect(AnyNaN, Plan.getCanonicalIV(), VecV);
    ResumeR->setOperand(0, NewSel);
  }

  // Combine the middle block's existing branch condition with "no NaN was
  // seen", so that an early exit due to NaN makes the condition false.
  auto *MiddleTerm = MiddleVPBB->getTerminator();
  Builder.setInsertPoint(MiddleTerm);
  VPValue *MiddleCond = MiddleTerm->getOperand(0);
  VPValue *NewCond = Builder.createAnd(MiddleCond, Builder.createNot(AnyNaN));
  MiddleTerm->setOperand(0, NewCond);
  return true;
}

0 commit comments

Comments
 (0)