Skip to content

Commit b663e56

Browse files
authored
[VPlan] Fix header masks in EVL tail folding (#150202)
With EVL tail folding, the EVL may not always be VF on the second-to-last iteration. Recipes that have been converted to VP intrinsics via optimizeMaskToEVL account for this, but recipes that are left behind will still use the old header mask which may end up having a different vector length. This is effectively the same as #95368, and fixes this by converting header masks from icmp ule wide-canonical-iv, backedge-trip-count -> icmp ult step-vector, evl. Without it, recipes that fall through optimizeMaskToEVL may use the wrong vector length, e.g. in #150074 and #149981. We really need to split off optimizeMaskToEVL into VPlanTransforms::optimize and move transformRecipestoEVLRecipes into tryToBuildVPlanWithVPRecipes, so we don't mix up what is needed for correctness and what is needed to optimize away the mask computations. We should be able to still generate a correct albeit suboptimal VPlan without running optimizeMaskToEVL. I've added a TODO for this, which I think we can do after #148274 Fixes #150197
1 parent 9b23e2b commit b663e56

File tree

5 files changed

+43
-33
lines changed

5 files changed

+43
-33
lines changed

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2244,6 +2244,9 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
22442244

22452245
// Try to optimize header mask recipes away to their EVL variants.
22462246
for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
2247+
// TODO: Split optimizeMaskToEVL out and move into
2248+
// VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in
2249+
// tryToBuildVPlanWithVPRecipes beforehand.
22472250
for (VPUser *U : collectUsersRecursively(HeaderMask)) {
22482251
auto *CurRecipe = cast<VPRecipeBase>(U);
22492252
VPRecipeBase *EVLRecipe =
@@ -2265,6 +2268,20 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
22652268
}
22662269
ToErase.push_back(CurRecipe);
22672270
}
2271+
2272+
// Replace header masks with a mask equivalent to predicating by EVL:
2273+
//
2274+
// icmp ule widen-canonical-iv backedge-taken-count
2275+
// ->
2276+
// icmp ult step-vector, EVL
2277+
VPRecipeBase *EVLR = EVL.getDefiningRecipe();
2278+
VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
2279+
Type *EVLType = TypeInfo.inferScalarType(&EVL);
2280+
VPValue *EVLMask = Builder.createICmp(
2281+
CmpInst::ICMP_ULT,
2282+
Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
2283+
HeaderMask->replaceAllUsesWith(EVLMask);
2284+
ToErase.push_back(HeaderMask->getDefiningRecipe());
22682285
}
22692286

22702287
for (VPRecipeBase *R : reverse(ToErase)) {

llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
171171
.Case<VPInstructionWithType>(
172172
[&](const VPInstructionWithType *S) { return VerifyEVLUse(*S, 0); })
173173
.Case<VPInstruction>([&](const VPInstruction *I) {
174-
if (I->getOpcode() == Instruction::PHI)
174+
if (I->getOpcode() == Instruction::PHI ||
175+
I->getOpcode() == Instruction::ICmp)
175176
return VerifyEVLUse(*I, 1);
176177
switch (I->getOpcode()) {
177178
case Instruction::Add:

llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,13 @@ define void @test(ptr %p, i64 %a, i8 %b) {
3535
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_COND]] ]
3636
; CHECK-NEXT: [[AVL:%.*]] = sub i32 9, [[EVL_BASED_IV]]
3737
; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 2, i1 true)
38-
; CHECK-NEXT: [[TMP12:%.*]] = mul i32 1, [[TMP11]]
39-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP12]], i64 0
38+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP11]], i64 0
4039
; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT5]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
41-
; CHECK-NEXT: [[TMP13:%.*]] = icmp ule <vscale x 2 x i32> [[VEC_IND]], splat (i32 8)
40+
; CHECK-NEXT: [[TMP20:%.*]] = mul i32 1, [[TMP11]]
41+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP20]], i64 0
42+
; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT7]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
43+
; CHECK-NEXT: [[TMP19:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
44+
; CHECK-NEXT: [[TMP13:%.*]] = icmp ult <vscale x 2 x i32> [[TMP19]], [[BROADCAST_SPLAT6]]
4245
; CHECK-NEXT: [[TMP14:%.*]] = icmp sge <vscale x 2 x i32> [[VEC_IND]], splat (i32 2)
4346
; CHECK-NEXT: [[TMP15:%.*]] = select <vscale x 2 x i1> [[TMP13]], <vscale x 2 x i1> [[TMP14]], <vscale x 2 x i1> zeroinitializer
4447
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP15]], <vscale x 2 x i32> [[TMP7]], <vscale x 2 x i32> [[TMP8]]
@@ -47,7 +50,7 @@ define void @test(ptr %p, i64 %a, i8 %b) {
4750
; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i8.nxv2p0(<vscale x 2 x i8> [[TMP17]], <vscale x 2 x ptr> align 1 [[BROADCAST_SPLAT4]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
4851
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP11]], [[EVL_BASED_IV]]
4952
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]]
50-
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT6]]
53+
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT8]]
5154
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
5255
; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_COND]], !llvm.loop [[LOOP0:![0-9]+]]
5356
; CHECK: middle.block:

llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cond-reduction.ll

Lines changed: 13 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -254,25 +254,20 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) {
254254
; IF-EVL-OUTLOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
255255
; IF-EVL-OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
256256
; IF-EVL-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
257-
; IF-EVL-OUTLOOP-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
258257
; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
259258
; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
260259
; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START]], i32 0
261-
; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
262-
; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
263260
; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
264261
; IF-EVL-OUTLOOP: vector.body:
265262
; IF-EVL-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
266263
; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
267264
; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[TMP9]], [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
268265
; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
269266
; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 4, i1 true)
270-
; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
271-
; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
272-
; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
273-
; IF-EVL-OUTLOOP-NEXT: [[TMP14:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP13]]
274-
; IF-EVL-OUTLOOP-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP14]]
275-
; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
267+
; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP11]], i64 0
268+
; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
269+
; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
270+
; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = icmp ult <vscale x 4 x i32> [[TMP12]], [[BROADCAST_SPLAT]]
276271
; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV]]
277272
; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
278273
; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp sle <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 3)
@@ -774,35 +769,27 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) {
774769
; IF-EVL-OUTLOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
775770
; IF-EVL-OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
776771
; IF-EVL-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
777-
; IF-EVL-OUTLOOP-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
778772
; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
779773
; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
780774
; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START]], i32 0
781-
; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
782-
; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
783-
; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
784-
; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = mul <vscale x 4 x i64> [[TMP10]], splat (i64 1)
785-
; IF-EVL-OUTLOOP-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP11]]
786-
; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
787-
; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = mul <vscale x 4 x i32> [[TMP12]], splat (i32 1)
775+
; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
776+
; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = mul <vscale x 4 x i32> [[TMP10]], splat (i32 1)
788777
; IF-EVL-OUTLOOP-NEXT: [[INDUCTION1:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP13]]
789778
; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
790779
; IF-EVL-OUTLOOP: vector.body:
791780
; IF-EVL-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
792781
; IF-EVL-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
793-
; IF-EVL-OUTLOOP-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
794782
; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
795783
; IF-EVL-OUTLOOP-NEXT: [[VEC_IND2:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION1]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT7:%.*]], [[VECTOR_BODY]] ]
796784
; IF-EVL-OUTLOOP-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[IV]]
797785
; IF-EVL-OUTLOOP-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
798-
; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = mul i32 1, [[TMP14]]
799-
; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP15]], i64 0
786+
; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP14]], i64 0
800787
; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
801-
; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64
802-
; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = mul i64 1, [[TMP16]]
803-
; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP17]], i64 0
804-
; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT5]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
805-
; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
788+
; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = mul i32 1, [[TMP14]]
789+
; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP11]], i64 0
790+
; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
791+
; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
792+
; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp ult <vscale x 4 x i32> [[TMP12]], [[BROADCAST_SPLAT4]]
806793
; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
807794
; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[ARRAYIDX]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
808795
; IF-EVL-OUTLOOP-NEXT: [[TMP21:%.*]] = icmp sle <vscale x 4 x i32> [[VP_OP_LOAD]], [[VEC_IND2]]
@@ -813,8 +800,7 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) {
813800
; IF-EVL-OUTLOOP-NEXT: [[TMP25:%.*]] = zext i32 [[TMP14]] to i64
814801
; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP25]], [[IV]]
815802
; IF-EVL-OUTLOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
816-
; IF-EVL-OUTLOOP-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT6]]
817-
; IF-EVL-OUTLOOP-NEXT: [[VEC_IND_NEXT7]] = add <vscale x 4 x i32> [[VEC_IND2]], [[BROADCAST_SPLAT4]]
803+
; IF-EVL-OUTLOOP-NEXT: [[VEC_IND_NEXT7]] = add <vscale x 4 x i32> [[VEC_IND2]], [[BROADCAST_SPLAT2]]
818804
; IF-EVL-OUTLOOP-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
819805
; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK1:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
820806
; IF-EVL-OUTLOOP: middle.block:

llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -415,11 +415,14 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
415415
; TF-SCALABLE-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
416416
; TF-SCALABLE-NEXT: [[AVL:%.*]] = sub i64 1025, [[INDEX]]
417417
; TF-SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
418+
; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP7]], i64 0
419+
; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
418420
; TF-SCALABLE-NEXT: [[TMP11:%.*]] = zext i32 [[TMP7]] to i64
419421
; TF-SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP11]]
420422
; TF-SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP8]], i64 0
421423
; TF-SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
422-
; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IND]], splat (i64 1024)
424+
; TF-SCALABLE-NEXT: [[TMP16:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
425+
; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = icmp ult <vscale x 4 x i32> [[TMP16]], [[BROADCAST_SPLAT4]]
423426
; TF-SCALABLE-NEXT: [[TMP10:%.*]] = icmp ugt <vscale x 4 x i64> [[VEC_IND]], splat (i64 10)
424427
; TF-SCALABLE-NEXT: [[TMP9:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> zeroinitializer
425428
; TF-SCALABLE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i64> @llvm.vp.gather.nxv4i64.nxv4p0(<vscale x 4 x ptr> align 8 [[BROADCAST_SPLAT]], <vscale x 4 x i1> [[TMP10]], i32 [[TMP7]])

0 commit comments

Comments
 (0)