@@ -1505,6 +1505,11 @@ class LoopVectorizationCostModel {
1505
1505
ElementCount UserVF,
1506
1506
bool FoldTailByMasking);
1507
1507
1508
+ /// If \p MaxTripCount is non-zero and does not exceed the estimated number of
1509
+ /// lanes of \p VF, clamps \p VF to the largest power of two <= MaxTripCount.
/// MaxTripCount is first reduced by one when a scalar epilogue is required,
/// and with \p FoldTailByMasking the clamp only applies when the (adjusted)
/// trip count is itself a power of two. Otherwise \p VF is returned unchanged.
1510
+ ElementCount clampVFByMaxTripCount (ElementCount VF, unsigned MaxTripCount,
1511
+ bool FoldTailByMasking) const ;
1512
+
1508
1513
/// \return the maximized element count based on the target's vector
1509
1514
/// registers and the loop trip-count, but limited to a maximum safe VF.
1510
1515
/// This is a helper function of computeFeasibleMaxVF.
@@ -3854,6 +3859,38 @@ bool LoopVectorizationCostModel::useMaxBandwidth(
3854
3859
Legal->hasVectorCallVariants ())));
3855
3860
}
3856
3861
3862
// Clamps VF by the loop's maximum trip count. Returns a power-of-two element
// count no larger than the (adjusted) MaxTripCount when that count is non-zero
// and does not exceed the estimated lane count of VF; otherwise returns VF
// unchanged. A MaxTripCount of zero never triggers the clamp.
+ ElementCount LoopVectorizationCostModel::clampVFByMaxTripCount (
3863
+ ElementCount VF, unsigned MaxTripCount, bool FoldTailByMasking) const {
// Estimated lane count of VF: its known minimum, multiplied by the
// vscale_range minimum when VF is scalable and the function has the attribute.
3864
+ unsigned EstimatedVF = VF.getKnownMinValue ();
3865
+ if (VF.isScalable () && TheFunction->hasFnAttribute (Attribute::VScaleRange)) {
3866
+ auto Attr = TheFunction->getFnAttribute (Attribute::VScaleRange);
3867
+ auto Min = Attr.getVScaleRangeMin ();
3868
+ EstimatedVF *= Min;
3869
+ }
3870
+
3871
+ // When a scalar epilogue is required, at least one iteration of the scalar
3872
+ // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
3873
+ // max VF that results in a dead vector loop.
3874
+ if (MaxTripCount > 0 && requiresScalarEpilogue (true ))
3875
+ MaxTripCount -= 1 ;
3876
+
// With tail folding the clamp is only taken for power-of-two trip counts.
3877
+ if (MaxTripCount && MaxTripCount <= EstimatedVF &&
3878
+ (!FoldTailByMasking || isPowerOf2_32 (MaxTripCount))) {
3879
+ // The upper bound of the loop trip count (TC) is known at compile time and
3880
+ // does not exceed the estimated lane count, so there is no point in choosing
3881
+ // a VF greater than TC. Select the maximum power of two which doesn't exceed
3882
+ // TC. If VF is scalable, we only fall back on a fixed VF when the TC is less
3883
+ // than or equal to the estimated number of lanes (and the tail is not folded).
// NOTE(review): EstimatedVF may already include the vscale_range minimum,
// while the returned count is built directly from the trip count; when the
// result stays scalable (tail folding), confirm its known minimum cannot end
// up larger than VF's.
3884
+ auto ClampedUpperTripCount = llvm::bit_floor (MaxTripCount);
3885
+ LLVM_DEBUG (dbgs () << " LV: Clamping the MaxVF to maximum power of two not "
3886
+ " exceeding the constant trip count: "
3887
+ << ClampedUpperTripCount << " \n " );
3888
+ return ElementCount::get (ClampedUpperTripCount,
3889
+ FoldTailByMasking ? VF.isScalable () : false );
3890
+ }
// No clamping applies: hand VF back untouched so callers can detect that via
// equality with their input.
3891
+ return VF;
3892
+ }
3893
+
3857
3894
ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget (
3858
3895
unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
3859
3896
ElementCount MaxSafeVF, bool FoldTailByMasking) {
@@ -3885,40 +3922,14 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
3885
3922
return ElementCount::getFixed (1 );
3886
3923
}
3887
3924
3888
- unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue ();
3889
- if (MaxVectorElementCount.isScalable () &&
3890
- TheFunction->hasFnAttribute (Attribute::VScaleRange)) {
3891
- auto Attr = TheFunction->getFnAttribute (Attribute::VScaleRange);
3892
- auto Min = Attr.getVScaleRangeMin ();
3893
- WidestRegisterMinEC *= Min;
3894
- }
3895
-
3896
- // When a scalar epilogue is required, at least one iteration of the scalar
3897
- // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
3898
- // max VF that results in a dead vector loop.
3899
- if (MaxTripCount > 0 && requiresScalarEpilogue (true ))
3900
- MaxTripCount -= 1 ;
3901
-
3902
- if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
3903
- (!FoldTailByMasking || isPowerOf2_32 (MaxTripCount))) {
3904
- // If upper bound loop trip count (TC) is known at compile time there is no
3905
- // point in choosing VF greater than TC (as done in the loop below). Select
3906
- // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
3907
- // scalable, we only fall back on a fixed VF when the TC is less than or
3908
- // equal to the known number of lanes.
3909
- auto ClampedUpperTripCount = llvm::bit_floor (MaxTripCount);
3910
- LLVM_DEBUG (dbgs () << " LV: Clamping the MaxVF to maximum power of two not "
3911
- " exceeding the constant trip count: "
3912
- << ClampedUpperTripCount << " \n " );
3913
- return ElementCount::get (
3914
- ClampedUpperTripCount,
3915
- FoldTailByMasking ? MaxVectorElementCount.isScalable () : false );
3916
- }
3925
+ ElementCount MaxVF = clampVFByMaxTripCount (MaxVectorElementCount,
3926
+ MaxTripCount, FoldTailByMasking);
3927
+ if (MaxVF != MaxVectorElementCount)
3928
+ return MaxVF;
3917
3929
3918
3930
TargetTransformInfo::RegisterKind RegKind =
3919
3931
ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
3920
3932
: TargetTransformInfo::RGK_FixedWidthVector;
3921
- ElementCount MaxVF = MaxVectorElementCount;
3922
3933
3923
3934
if (MaxVF.isScalable ())
3924
3935
MaxPermissibleVFWithoutMaxBW.ScalableVF = MaxVF;
@@ -3940,6 +3951,8 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
3940
3951
}
3941
3952
}
3942
3953
3954
+ MaxVF = clampVFByMaxTripCount (MaxVF, MaxTripCount, FoldTailByMasking);
3955
+
3943
3956
// Invalidate any widening decisions we might have made, in case the loop
3944
3957
// requires prediction (decided later), but we have already made some
3945
3958
// load/store widening decisions.
0 commit comments