
Commit a9abd9e

fhahn authored and github-actions[bot] committed
Automerge: [LV] Also clamp MaxVF by trip count when maximizing vector bandwidth. (#149794)
Also clamp the max VF by the maximum trip count when maximizing vector bandwidth. Otherwise we may end up choosing a VF for which the vector loop never executes. PR: llvm/llvm-project#149794
2 parents dbe5333 + 77b1b95 commit a9abd9e
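To make the new clamping rule concrete, here is a minimal, self-contained sketch of the same arithmetic for fixed-width VFs only. The helper name, signature, and boolean flags below are simplified stand-ins for illustration, not the LLVM API; the real implementation works on ElementCount and also handles scalable VFs via the vscale_range attribute.

#include <bit>
#include <cstdio>

// Hypothetical, fixed-width-only model of clampVFByMaxTripCount: if the known
// trip-count bound is at or below the candidate VF, fall back to the largest
// power of two that does not exceed it, so the vector loop actually runs.
static unsigned clampVFByTripCountModel(unsigned VF, unsigned MaxTripCount,
                                        bool FoldTailByMasking,
                                        bool RequiresScalarEpilogue) {
  // A required scalar epilogue reserves one iteration for the scalar loop.
  if (MaxTripCount > 0 && RequiresScalarEpilogue)
    MaxTripCount -= 1;

  // Only clamp when the bound is known and at or below the candidate VF; with
  // tail folding, only clamp to an exact power-of-two trip count.
  if (MaxTripCount && MaxTripCount <= VF &&
      (!FoldTailByMasking || std::has_single_bit(MaxTripCount)))
    return std::bit_floor(MaxTripCount);
  return VF;
}

int main() {
  // With at most 4 iterations, a bandwidth-maximized VF of 8 would yield a
  // vector loop that never executes; the clamp picks VF = 4 instead.
  std::printf("clamped VF = %u\n",
              clampVFByTripCountModel(/*VF=*/8, /*MaxTripCount=*/4,
                                      /*FoldTailByMasking=*/false,
                                      /*RequiresScalarEpilogue=*/false));
  return 0;
}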

File tree

3 files changed: +103 -91 lines


llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 51 additions & 34 deletions
@@ -1506,6 +1506,11 @@ class LoopVectorizationCostModel {
                                            ElementCount UserVF,
                                            bool FoldTailByMasking);
 
+  /// If \p VF > MaxTripcount, clamps it to the next lower VF that is <=
+  /// MaxTripCount.
+  ElementCount clampVFByMaxTripCount(ElementCount VF, unsigned MaxTripCount,
+                                     bool FoldTailByMasking) const;
+
   /// \return the maximized element count based on the targets vector
   /// registers and the loop trip-count, but limited to a maximum safe VF.
   /// This is a helper function of computeFeasibleMaxVF.
@@ -3855,6 +3860,38 @@ bool LoopVectorizationCostModel::useMaxBandwidth(
                                  Legal->hasVectorCallVariants())));
 }
 
+ElementCount LoopVectorizationCostModel::clampVFByMaxTripCount(
+    ElementCount VF, unsigned MaxTripCount, bool FoldTailByMasking) const {
+  unsigned EstimatedVF = VF.getKnownMinValue();
+  if (VF.isScalable() && TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
+    auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
+    auto Min = Attr.getVScaleRangeMin();
+    EstimatedVF *= Min;
+  }
+
+  // When a scalar epilogue is required, at least one iteration of the scalar
+  // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
+  // max VF that results in a dead vector loop.
+  if (MaxTripCount > 0 && requiresScalarEpilogue(true))
+    MaxTripCount -= 1;
+
+  if (MaxTripCount && MaxTripCount <= EstimatedVF &&
+      (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
+    // If upper bound loop trip count (TC) is known at compile time there is no
+    // point in choosing VF greater than TC (as done in the loop below). Select
+    // maximum power of two which doesn't exceed TC. If VF is
+    // scalable, we only fall back on a fixed VF when the TC is less than or
+    // equal to the known number of lanes.
+    auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
+    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
+                         "exceeding the constant trip count: "
+                      << ClampedUpperTripCount << "\n");
+    return ElementCount::get(ClampedUpperTripCount,
+                             FoldTailByMasking ? VF.isScalable() : false);
+  }
+  return VF;
+}
+
 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
     unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
     ElementCount MaxSafeVF, bool FoldTailByMasking) {
@@ -3886,40 +3923,16 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
     return ElementCount::getFixed(1);
   }
 
-  unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
-  if (MaxVectorElementCount.isScalable() &&
-      TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
-    auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
-    auto Min = Attr.getVScaleRangeMin();
-    WidestRegisterMinEC *= Min;
-  }
-
-  // When a scalar epilogue is required, at least one iteration of the scalar
-  // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
-  // max VF that results in a dead vector loop.
-  if (MaxTripCount > 0 && requiresScalarEpilogue(true))
-    MaxTripCount -= 1;
-
-  if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
-      (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
-    // If upper bound loop trip count (TC) is known at compile time there is no
-    // point in choosing VF greater than TC (as done in the loop below). Select
-    // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
-    // scalable, we only fall back on a fixed VF when the TC is less than or
-    // equal to the known number of lanes.
-    auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
-    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
-                         "exceeding the constant trip count: "
-                      << ClampedUpperTripCount << "\n");
-    return ElementCount::get(
-        ClampedUpperTripCount,
-        FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
-  }
+  ElementCount MaxVF = clampVFByMaxTripCount(MaxVectorElementCount,
+                                             MaxTripCount, FoldTailByMasking);
+  // If the MaxVF was already clamped, there's no point in trying to pick a
+  // larger one.
+  if (MaxVF != MaxVectorElementCount)
+    return MaxVF;
 
   TargetTransformInfo::RegisterKind RegKind =
       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
                            : TargetTransformInfo::RGK_FixedWidthVector;
-  ElementCount MaxVF = MaxVectorElementCount;
 
   if (MaxVF.isScalable())
     MaxPermissibleVFWithoutMaxBW.ScalableVF = MaxVF;
@@ -3941,10 +3954,14 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
       }
     }
 
-    // Invalidate any widening decisions we might have made, in case the loop
-    // requires prediction (decided later), but we have already made some
-    // load/store widening decisions.
-    invalidateCostModelingDecisions();
+    MaxVF = clampVFByMaxTripCount(MaxVF, MaxTripCount, FoldTailByMasking);
+
+    if (MaxVectorElementCount != MaxVF) {
+      // Invalidate any widening decisions we might have made, in case the loop
+      // requires prediction (decided later), but we have already made some
+      // load/store widening decisions.
+      invalidateCostModelingDecisions();
+    }
   }
   return MaxVF;
 }
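As a reading aid for the hunks above, the following hedged sketch models the new control flow in getMaximizedVFForTarget using plain unsigned factors: clamp the default register-width VF first, return early if the trip count already limits it, otherwise widen for bandwidth and clamp a second time. The real code additionally invalidates earlier cost-model decisions when that second clamp changes the VF; the names and types here are illustrative simplifications, not LLVM's.

#include <algorithm>
#include <bit>
#include <cstdio>

// Largest power of two not exceeding the known trip-count bound (fixed-width
// only; a simplified stand-in for clampVFByMaxTripCount).
static unsigned clampByTripCount(unsigned VF, unsigned MaxTripCount) {
  if (MaxTripCount && MaxTripCount <= VF)
    return std::bit_floor(MaxTripCount);
  return VF;
}

static unsigned maximizedVFModel(unsigned RegisterWidthVF, unsigned BandwidthVF,
                                 unsigned MaxTripCount) {
  unsigned MaxVF = clampByTripCount(RegisterWidthVF, MaxTripCount);
  // If the trip count already clamps the default VF, a wider bandwidth-driven
  // VF cannot execute either, so stop here (mirrors the early return).
  if (MaxVF != RegisterWidthVF)
    return MaxVF;
  // Maximizing vector bandwidth may propose a wider VF...
  MaxVF = std::max(MaxVF, BandwidthVF);
  // ...which is why the patch applies the clamp a second time as well.
  return clampByTripCount(MaxVF, MaxTripCount);
}

int main() {
  // Trip count 41, default VF 2, bandwidth-maximized VF 8: nothing clamps.
  std::printf("%u\n", maximizedVFModel(2, 8, 41)); // prints 8
  // Trip count 4: the bandwidth-maximized VF 8 is clamped down to 4.
  std::printf("%u\n", maximizedVFModel(2, 8, 4));  // prints 4
  return 0;
}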

llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll

Lines changed: 34 additions & 38 deletions
@@ -288,38 +288,32 @@ define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks(
 ; CHECK-LABEL: define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks(
 ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], ptr noalias [[D:%.*]], ptr noalias [[E:%.*]], ptr noalias [[F:%.*]], ptr noalias [[G:%.*]], ptr noalias [[H:%.*]], ptr noalias [[I:%.*]], ptr noalias [[J:%.*]], ptr noalias [[K:%.*]], ptr [[L:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br i1 true, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i64, ptr [[J]], i64 0
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP0]], align 8
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; CHECK-NEXT: [[TMP1:%.*]] = trunc <8 x i64> [[STRIDED_VEC]] to <8 x i16>
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[K]], i64 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[K]], i64 2
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[K]], i64 4
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i16, ptr [[K]], i64 6
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[K]], i64 8
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[K]], i64 10
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr [[K]], i64 12
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i16, ptr [[K]], i64 14
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0
-; CHECK-NEXT: store i16 [[TMP14]], ptr [[TMP6]], align 2
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1
-; CHECK-NEXT: store i16 [[TMP15]], ptr [[TMP7]], align 2
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2
-; CHECK-NEXT: store i16 [[TMP16]], ptr [[TMP8]], align 2
-; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3
-; CHECK-NEXT: store i16 [[TMP17]], ptr [[TMP9]], align 2
-; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4
-; CHECK-NEXT: store i16 [[TMP18]], ptr [[TMP10]], align 2
-; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5
-; CHECK-NEXT: store i16 [[TMP19]], ptr [[TMP11]], align 2
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6
-; CHECK-NEXT: store i16 [[TMP20]], ptr [[TMP12]], align 2
-; CHECK-NEXT: [[TMP21:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
-; CHECK-NEXT: store i16 [[TMP21]], ptr [[TMP13]], align 2
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
+; CHECK-NEXT: [[IV:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT: [[GEP_J:%.*]] = getelementptr i64, ptr [[J]], i64 [[IV]]
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[GEP_J]], align 8
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[TMP5:%.*]] = trunc <4 x i64> [[STRIDED_VEC]] to <4 x i16>
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[K]], i64 [[IV]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[TMP5]], i32 0
+; CHECK-NEXT: store i16 [[TMP10]], ptr [[TMP6]], align 2
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[TMP5]], i32 1
+; CHECK-NEXT: store i16 [[TMP11]], ptr [[TMP7]], align 2
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
+; CHECK-NEXT: store i16 [[TMP12]], ptr [[TMP8]], align 2
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
+; CHECK-NEXT: store i16 [[TMP13]], ptr [[TMP9]], align 2
 ; CHECK-NEXT: store i64 0, ptr [[A]], align 8
 ; CHECK-NEXT: store i64 0, ptr [[B]], align 8
 ; CHECK-NEXT: store i64 0, ptr [[C]], align 8
@@ -330,18 +324,20 @@ define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks(
 ; CHECK-NEXT: store i64 0, ptr [[H]], align 8
 ; CHECK-NEXT: store i64 0, ptr [[I]], align 8
 ; CHECK-NEXT: store i64 0, ptr [[L]], align 8
-; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
+; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[SCALAR_PH]]
 ; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 8, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT: br label %[[LOOP:.*]]
 ; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[GEP_J:%.*]] = getelementptr i64, ptr [[J]], i64 [[IV]]
-; CHECK-NEXT: [[L_J:%.*]] = load i64, ptr [[GEP_J]], align 8
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[GEP_J1:%.*]] = getelementptr i64, ptr [[J]], i64 [[IV1]]
+; CHECK-NEXT: [[L_J:%.*]] = load i64, ptr [[GEP_J1]], align 8
 ; CHECK-NEXT: [[L_TRUNC:%.*]] = trunc i64 [[L_J]] to i16
-; CHECK-NEXT: [[GEP_K:%.*]] = getelementptr i16, ptr [[K]], i64 [[IV]]
+; CHECK-NEXT: [[GEP_K:%.*]] = getelementptr i16, ptr [[K]], i64 [[IV1]]
 ; CHECK-NEXT: store i16 [[L_TRUNC]], ptr [[GEP_K]], align 2
 ; CHECK-NEXT: store i64 0, ptr [[A]], align 8
 ; CHECK-NEXT: store i64 0, ptr [[B]], align 8
@@ -353,9 +349,9 @@ define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks(
 ; CHECK-NEXT: store i64 0, ptr [[H]], align 8
 ; CHECK-NEXT: store i64 0, ptr [[I]], align 8
 ; CHECK-NEXT: store i64 0, ptr [[L]], align 8
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 2
-; CHECK-NEXT: [[EC:%.*]] = icmp ult i64 [[IV]], 14
-; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV1]], 2
+; CHECK-NEXT: [[EC:%.*]] = icmp ult i64 [[IV1]], 14
+; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;

llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll

Lines changed: 18 additions & 19 deletions
@@ -1816,13 +1816,12 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT: entry:
 ; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
-; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVE1: vector.ph:
 ; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]]
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 41, [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 41, [[N_MOD_VF]]
 ; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
 ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -1845,7 +1844,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK-INTERLEAVE1: middle.block:
 ; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP15]])
-; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]]
 ; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-INTERLEAVE1: scalar.ph:
 ;
@@ -1854,13 +1853,13 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT: entry:
 ; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
-; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]]
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 41, [[TMP1]]
 ; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVED: vector.ph:
 ; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 41, [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 41, [[N_MOD_VF]]
 ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
 ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -1897,7 +1896,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED: middle.block:
 ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[TMP25]], [[TMP24]]
 ; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]]
 ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-INTERLEAVED: scalar.ph:
 ;
@@ -1906,19 +1905,19 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
 ; CHECK-MAXBW-NEXT: entry:
 ; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
-; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]]
+; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 41, [[TMP1]]
 ; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-MAXBW: vector.ph:
 ; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
-; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]]
-; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 41, [[TMP3]]
+; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 41, [[N_MOD_VF]]
 ; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
 ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-MAXBW: vector.body:
 ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
 ; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
@@ -1927,15 +1926,15 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
 ; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP10]]
 ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP11]], i32 0
 ; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1
-; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i64>
-; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = mul nuw nsw <vscale x 8 x i64> [[TMP13]], [[TMP9]]
-; CHECK-MAXBW-NEXT: [[TMP19]] = add <vscale x 8 x i64> [[VEC_PHI]], [[TMP14]]
+; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i64>
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = mul nuw nsw <vscale x 8 x i64> [[TMP17]], [[TMP9]]
+; CHECK-MAXBW-NEXT: [[TMP14]] = add <vscale x 8 x i64> [[VEC_PHI]], [[TMP13]]
 ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> [[TMP19]])
-; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]]
+; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> [[TMP14]])
+; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]]
 ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-MAXBW: scalar.ph:
 ;
@@ -1954,7 +1953,7 @@ for.body: ; preds = %entry, %for.body
   %conv3 = zext i8 %1 to i64
   %mul = mul nuw nsw i64 %conv3, %conv
   %add = add i64 %sum, %mul
-  %exitcond.not = icmp eq i64 %i.iv.next, 16
+  %exitcond.not = icmp eq i64 %i.iv.next, 41
   br i1 %exitcond.not, label %exit, label %for.body
 
 exit: ; preds = %for.body
