Skip to content

Commit 20fc297

Browse files
authored
[LoopVectorizer] Only check register pressure for VFs that have been enabled via maxBandwidth (#149056)
Currently, if MaxBandwidth is enabled, register pressure is checked for every VF. This change performs that check only for VFs that the LoopVectorizer would not have considered had MaxBandwidth been disabled, i.e. VFs greater than the maximum VF permissible without MaxBandwidth. In theory this allows higher VFs to be considered than would otherwise be deemed "safe" (from a register-pressure perspective); more concretely, it reduces the amount of work done at compile time when MaxBandwidth is enabled.
1 parent 8f3e78f commit 20fc297

File tree

4 files changed

+74
-16
lines changed

4 files changed

+74
-16
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -947,9 +947,8 @@ class LoopVectorizationCostModel {
947947
/// user options, for the given register kind.
948948
bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind);
949949

950-
/// \return True if maximizing vector bandwidth is enabled by the target or
951-
/// user options, for the given vector factor.
952-
bool useMaxBandwidth(ElementCount VF);
950+
/// \return True if register pressure should be calculated for the given VF.
951+
bool shouldCalculateRegPressureForVF(ElementCount VF);
953952

954953
/// \return The size (in bits) of the smallest and widest types in the code
955954
/// that needs to be vectorized. We ignore values that remain scalar such as
@@ -1736,6 +1735,9 @@ class LoopVectorizationCostModel {
17361735
/// Whether this loop should be optimized for size based on function attribute
17371736
/// or profile information.
17381737
bool OptForSize;
1738+
1739+
/// The highest VF possible for this loop, without using MaxBandwidth.
1740+
FixedScalableVFPair MaxPermissibleVFWithoutMaxBW;
17391741
};
17401742
} // end namespace llvm
17411743

@@ -3832,10 +3834,16 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
38323834
return FixedScalableVFPair::getNone();
38333835
}
38343836

3835-
bool LoopVectorizationCostModel::useMaxBandwidth(ElementCount VF) {
3836-
return useMaxBandwidth(VF.isScalable()
3837-
? TargetTransformInfo::RGK_ScalableVector
3838-
: TargetTransformInfo::RGK_FixedWidthVector);
3837+
bool LoopVectorizationCostModel::shouldCalculateRegPressureForVF(
3838+
ElementCount VF) {
3839+
if (!useMaxBandwidth(VF.isScalable()
3840+
? TargetTransformInfo::RGK_ScalableVector
3841+
: TargetTransformInfo::RGK_FixedWidthVector))
3842+
return false;
3843+
// Only calculate register pressure for VFs enabled by MaxBandwidth.
3844+
return ElementCount::isKnownGT(
3845+
VF, VF.isScalable() ? MaxPermissibleVFWithoutMaxBW.ScalableVF
3846+
: MaxPermissibleVFWithoutMaxBW.FixedVF);
38393847
}
38403848

38413849
bool LoopVectorizationCostModel::useMaxBandwidth(
@@ -3911,6 +3919,12 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
39113919
ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
39123920
: TargetTransformInfo::RGK_FixedWidthVector;
39133921
ElementCount MaxVF = MaxVectorElementCount;
3922+
3923+
if (MaxVF.isScalable())
3924+
MaxPermissibleVFWithoutMaxBW.ScalableVF = MaxVF;
3925+
else
3926+
MaxPermissibleVFWithoutMaxBW.FixedVF = MaxVF;
3927+
39143928
if (useMaxBandwidth(RegKind)) {
39153929
auto MaxVectorElementCountMaxBW = ElementCount::get(
39163930
llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
@@ -4264,9 +4278,10 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
42644278
if (VF.isScalar())
42654279
continue;
42664280

4267-
/// Don't consider the VF if it exceeds the number of registers for the
4268-
/// target.
4269-
if (CM.useMaxBandwidth(VF) && RUs[I].exceedsMaxNumRegs(TTI))
4281+
/// If the VF was proposed due to MaxBandwidth, don't consider the VF if
4282+
/// it exceeds the number of registers for the target.
4283+
if (CM.shouldCalculateRegPressureForVF(VF) &&
4284+
RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs))
42704285
continue;
42714286

42724287
InstructionCost C = CM.expectedCost(VF);
@@ -7044,7 +7059,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
70447059
InstructionCost Cost = cost(*P, VF);
70457060
VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
70467061

7047-
if (CM.useMaxBandwidth(VF) && RUs[I].exceedsMaxNumRegs(TTI)) {
7062+
if (CM.shouldCalculateRegPressureForVF(VF) &&
7063+
RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs)) {
70487064
LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width "
70497065
<< VF << " because it uses too many registers\n");
70507066
continue;

llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -404,9 +404,12 @@ static unsigned getVFScaleFactor(VPRecipeBase *R) {
404404
return 1;
405405
}
406406

407-
bool VPRegisterUsage::exceedsMaxNumRegs(const TargetTransformInfo &TTI) const {
408-
return any_of(MaxLocalUsers, [&TTI](auto &LU) {
409-
return LU.second > TTI.getNumberOfRegisters(LU.first);
407+
bool VPRegisterUsage::exceedsMaxNumRegs(const TargetTransformInfo &TTI,
408+
unsigned OverrideMaxNumRegs) const {
409+
return any_of(MaxLocalUsers, [&TTI, &OverrideMaxNumRegs](auto &LU) {
410+
return LU.second > (OverrideMaxNumRegs > 0
411+
? OverrideMaxNumRegs
412+
: TTI.getNumberOfRegisters(LU.first));
410413
});
411414
}
412415

llvm/lib/Transforms/Vectorize/VPlanAnalysis.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,8 +85,10 @@ struct VPRegisterUsage {
8585
SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
8686

8787
/// Check if any of the tracked live intervals exceeds the number of
88-
/// available registers for the target.
89-
bool exceedsMaxNumRegs(const TargetTransformInfo &TTI) const;
88+
/// available registers for the target. If non-zero, OverrideMaxNumRegs
89+
/// is used in place of the target's number of registers.
90+
bool exceedsMaxNumRegs(const TargetTransformInfo &TTI,
91+
unsigned OverrideMaxNumRegs = 0) const;
9092
};
9193

9294
/// Estimate the register usage for \p Plan and vectorization factors in \p VFs
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -debug-only=loop-vectorize -disable-output -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-REGS-VP
2+
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -debug-only=loop-vectorize -disable-output -force-target-num-vector-regs=1 -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-NOREGS-VP
3+
4+
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
5+
target triple = "aarch64-none-unknown-elf"
6+
7+
define i32 @dotp(ptr %a, ptr %b) #0 {
8+
; CHECK-REGS-VP-NOT: LV(REG): Not considering vector loop of width vscale x 16 because it uses too many registers
9+
; CHECK-REGS-VP: LV: Selecting VF: vscale x 8.
10+
;
11+
; CHECK-NOREGS-VP: LV(REG): Not considering vector loop of width vscale x 8 because it uses too many registers
12+
; CHECK-NOREGS-VP: LV(REG): Not considering vector loop of width vscale x 16 because it uses too many registers
13+
; CHECK-NOREGS-VP: LV: Selecting VF: vscale x 4.
14+
entry:
15+
br label %for.body
16+
17+
for.body: ; preds = %for.body, %entry
18+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
19+
%accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
20+
%gep.a = getelementptr i8, ptr %a, i64 %iv
21+
%load.a = load i8, ptr %gep.a, align 1
22+
%ext.a = zext i8 %load.a to i32
23+
%gep.b = getelementptr i8, ptr %b, i64 %iv
24+
%load.b = load i8, ptr %gep.b, align 1
25+
%ext.b = zext i8 %load.b to i32
26+
%mul = mul i32 %ext.b, %ext.a
27+
%sub = sub i32 0, %mul
28+
%add = add i32 %accum, %sub
29+
%iv.next = add i64 %iv, 1
30+
%exitcond.not = icmp eq i64 %iv.next, 1024
31+
br i1 %exitcond.not, label %for.exit, label %for.body
32+
33+
for.exit: ; preds = %for.body
34+
ret i32 %add
35+
}
36+
37+
attributes #0 = { vscale_range(1,16) "target-features"="+sve" }

0 commit comments

Comments
 (0)