
Commit aa6367a

[VPlan] Introduce CSE pass

1 parent: 5cedb01

24 files changed: +177, -113 lines

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 1 addition & 0 deletions
@@ -7286,6 +7286,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   VPlanTransforms::narrowInterleaveGroups(
       BestVPlan, BestVF,
       TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
+  VPlanTransforms::cse(BestVPlan);
   VPlanTransforms::removeDeadRecipes(BestVPlan);
 
   VPlanTransforms::convertToConcreteRecipes(BestVPlan);

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 7 additions & 2 deletions
@@ -792,8 +792,8 @@ class VPIRFlags {
   }
 
   CmpInst::Predicate getPredicate() const {
-    assert(OpType == OperationType::Cmp &&
-           "recipe doesn't have a compare predicate");
+    if (OpType != OperationType::Cmp)
+      return CmpInst::BAD_ICMP_PREDICATE;
     return CmpPredicate;
   }
 
@@ -897,6 +897,11 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
     return R && classof(R);
  }
 
+  static inline bool classof(const VPSingleDefRecipe *U) {
+    auto *R = dyn_cast<VPRecipeBase>(U);
+    return R && classof(R);
+  }
+
   void execute(VPTransformState &State) override = 0;
 };
 
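Context for the getPredicate() change: the CSE hashing and equality code added below calls getPredicate() on every VPRecipeWithIRFlags, including recipes that are not compares, so the hard assert is relaxed to a sentinel return value. A minimal standalone sketch of that assert-versus-sentinel tradeoff, using made-up types (OperationType, FlagsAsserting, FlagsWithSentinel) rather than the real VPlan classes:

// Toy illustration only; not VPlan code.
#include <cassert>

enum class OperationType { Cmp, Other };
enum Predicate { ICMP_EQ, ICMP_NE, BAD_ICMP_PREDICATE };

struct FlagsAsserting {
  OperationType OpType = OperationType::Other;
  Predicate CmpPredicate = ICMP_EQ;
  // Old behaviour: only valid to call on compare recipes.
  Predicate getPredicate() const {
    assert(OpType == OperationType::Cmp && "not a compare");
    return CmpPredicate;
  }
};

struct FlagsWithSentinel {
  OperationType OpType = OperationType::Other;
  Predicate CmpPredicate = ICMP_EQ;
  // New behaviour: safe to call unconditionally, e.g. when hashing or
  // comparing arbitrary recipes during CSE.
  Predicate getPredicate() const {
    if (OpType != OperationType::Cmp)
      return BAD_ICMP_PREDICATE;
    return CmpPredicate;
  }
};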

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 107 additions & 0 deletions
@@ -1766,6 +1766,113 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
   }
 }
 
+namespace {
+struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
+  static bool isSentinel(const VPSingleDefRecipe *Def) {
+    return Def == getEmptyKey() || Def == getTombstoneKey();
+  }
+
+  /// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
+  /// Returns an optional pair, where the first element indicates whether it is
+  /// an intrinsic ID.
+  static std::optional<std::pair<bool, unsigned>>
+  getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) {
+    return TypeSwitch<const VPSingleDefRecipe *,
+                      std::optional<std::pair<bool, unsigned>>>(R)
+        .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
+              VPWidenSelectRecipe, VPHistogramRecipe, VPPartialReductionRecipe,
+              VPReplicateRecipe>(
+            [](auto *I) { return std::make_pair(false, I->getOpcode()); })
+        .Case<VPWidenIntrinsicRecipe>([](auto *I) {
+          return std::make_pair(true, I->getVectorIntrinsicID());
+        })
+        .Default([](auto *) { return std::nullopt; });
+  }
+
+  /// During CSE, we can only handle certain recipes that don't read from
+  /// memory: if they read from memory, there could be an intervening write to
+  /// memory before the next instance is CSE'd, leading to an incorrect result.
+  /// We can extend the list of handled recipes in the future, provided we
+  /// account for the data embedded in them while checking for equality or
+  /// hashing.
+  static bool canHandle(const VPSingleDefRecipe *Def) {
+    return isa<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
+               VPWidenSelectRecipe, VPHistogramRecipe, VPReplicateRecipe,
+               VPWidenIntrinsicRecipe>(Def) &&
+           !Def->mayReadFromMemory();
+  }
+
+  /// Hash the underlying data of \p Def.
+  static unsigned getHashValue(const VPSingleDefRecipe *Def) {
+    const VPlan *Plan = Def->getParent()->getPlan();
+    VPTypeAnalysis TypeInfo(*Plan);
+    hash_code Result = hash_combine(
+        Def->getVPDefID(), getOpcodeOrIntrinsicID(Def),
+        TypeInfo.inferScalarType(Def), vputils::isSingleScalar(Def),
+        hash_combine_range(Def->operands()));
+    if (isa<VPReplicateRecipe>(Def))
+      return hash_combine(Result, Def->getUnderlyingInstr());
+    if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
+      return hash_combine(Result, RFlags->getPredicate());
+    return Result;
+  }
+
+  /// Check equality of underlying data of \p L and \p R.
+  static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
+    if (isSentinel(L) || isSentinel(R))
+      return L == R;
+    const VPlan *Plan = L->getParent()->getPlan();
+    VPTypeAnalysis TypeInfo(*Plan);
+    bool Result = L->getVPDefID() == R->getVPDefID() &&
+                  getOpcodeOrIntrinsicID(L) == getOpcodeOrIntrinsicID(R) &&
+                  TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R) &&
+                  vputils::isSingleScalar(L) == vputils::isSingleScalar(R) &&
+                  equal(L->operands(), R->operands());
+    if (Result && isa<VPReplicateRecipe>(L))
+      Result = L->getUnderlyingInstr() == R->getUnderlyingInstr();
+    if (Result && isa<VPRecipeWithIRFlags>(L))
+      Result = cast<VPRecipeWithIRFlags>(L)->getPredicate() ==
+               cast<VPRecipeWithIRFlags>(R)->getPredicate();
+    assert((!Result || getHashValue(L) == getHashValue(R)) &&
+           "Divergent hashes of equal values");
+    return Result;
+  }
+};
+} // end anonymous namespace
+
+/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
+/// Plan.
+void VPlanTransforms::cse(VPlan &Plan) {
+  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+  if (!LoopRegion)
+    return;
+  auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
+      vp_depth_first_shallow(Plan.getEntry()));
+  auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
+      vp_depth_first_shallow(LoopRegion->getEntry()));
+
+  // There is existing logic to sink instructions into replicate regions, and
+  // we'd be undoing that work if we went through replicate regions. Hence,
+  // don't CSE in replicate regions.
+  DenseMap<VPSingleDefRecipe *, VPSingleDefRecipe *, VPCSEDenseMapInfo> CSEMap;
+  for (VPBasicBlock *VPBB :
+       concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
+    for (VPRecipeBase &R : *VPBB) {
+      auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
+      if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
+        continue;
+      if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
+        // Drop poison-generating flags when reusing a value.
+        if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
+          RFlags->dropPoisonGeneratingFlags();
+        Def->replaceAllUsesWith(V);
+        continue;
+      }
+      CSEMap[Def] = Def;
+    }
+  }
+}
+
 /// Move loop-invariant recipes out of the vector loop region in \p Plan.
 static void licm(VPlan &Plan) {
   VPBasicBlock *Preheader = Plan.getVectorPreheader();
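To make the shape of the transform easier to see in isolation, the following is a toy sketch of the same DenseMap-based CSE pattern applied to a made-up three-address Expr node instead of VPlan recipes. Expr, its fields, and cseBlock() are hypothetical stand-ins; only DenseMap, DenseMapInfo, and hash_combine are the real LLVM ADT utilities the pass above relies on.

// Toy CSE sketch: key the map on the node itself, but hash and compare the
// underlying data, mirroring VPCSEDenseMapInfo above.
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include <vector>

struct Expr {
  unsigned Opcode = 0;
  Expr *LHS = nullptr, *RHS = nullptr; // operands (null for leaves)
  std::vector<Expr *> Users;           // toy use-list

  // Redirect all users of this node to New (stand-in for replaceAllUsesWith).
  void replaceAllUsesWith(Expr *New) {
    for (Expr *U : Users) {
      if (U->LHS == this)
        U->LHS = New;
      if (U->RHS == this)
        U->RHS = New;
    }
  }
};

namespace {
struct ExprCSEInfo : llvm::DenseMapInfo<Expr *> {
  static bool isSentinel(const Expr *E) {
    return E == getEmptyKey() || E == getTombstoneKey();
  }
  static unsigned getHashValue(const Expr *E) {
    return llvm::hash_combine(E->Opcode, E->LHS, E->RHS);
  }
  static bool isEqual(const Expr *L, const Expr *R) {
    if (isSentinel(L) || isSentinel(R))
      return L == R;
    return L->Opcode == R->Opcode && L->LHS == R->LHS && L->RHS == R->RHS;
  }
};
} // namespace

// Walk the block in order; the first occurrence of each expression becomes
// the canonical one, and later structurally identical nodes are redirected.
void cseBlock(std::vector<Expr *> &Block) {
  llvm::DenseMap<Expr *, Expr *, ExprCSEInfo> CSEMap;
  for (Expr *E : Block) {
    if (Expr *Canonical = CSEMap.lookup(E)) {
      E->replaceAllUsesWith(Canonical);
      continue;
    }
    CSEMap[E] = E;
  }
}

The real pass additionally restricts itself to recipes that don't read memory, hashes type and opcode/intrinsic data rather than raw operands alone, and skips replicate regions, as the comments in the diff above explain.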

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 4 additions & 0 deletions
@@ -246,6 +246,10 @@ struct VPlanTransforms {
   /// removing dead edges to their successors.
   static void removeBranchOnConst(VPlan &Plan);
 
+  /// Perform common-subexpression-elimination, which is best done after the \p
+  /// Plan is executed.
+  static void cse(VPlan &Plan);
+
   /// If there's a single exit block, optimize its phi recipes that use exiting
   /// IV values by feeding them precomputed end values instead, possibly taken
   /// one step backwards.

llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll

Lines changed: 2 additions & 3 deletions
@@ -330,11 +330,10 @@ define void @test_widen_induction_step_2(ptr %A, i64 %N, i32 %step) {
 ; CHECK-NEXT:    [[CMP_N11:%.*]] = icmp eq i64 [[N]], [[IND_END]]
 ; CHECK-NEXT:    br i1 [[CMP_N11]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL5:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL5]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL6]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[IV_2:%.*]] = phi i64 [ [[BC_RESUME_VAL6]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_1_NEXT]], [[LOOP]] ]
 ; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_1]]
 ; CHECK-NEXT:    [[ADD:%.*]] = add i64 [[IV_2]], 10

llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll

Lines changed: 3 additions & 6 deletions
@@ -44,10 +44,8 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; DEFAULT-NEXT:    [[TMP27:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i16>
 ; DEFAULT-NEXT:    [[TMP28:%.*]] = mul <vscale x 8 x i16> [[TMP26]], [[TMP13]]
 ; DEFAULT-NEXT:    [[TMP29:%.*]] = mul <vscale x 8 x i16> [[TMP27]], [[TMP13]]
-; DEFAULT-NEXT:    [[TMP30:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i16>
-; DEFAULT-NEXT:    [[TMP31:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i16>
-; DEFAULT-NEXT:    [[TMP32:%.*]] = or <vscale x 8 x i16> [[TMP28]], [[TMP30]]
-; DEFAULT-NEXT:    [[TMP33:%.*]] = or <vscale x 8 x i16> [[TMP29]], [[TMP31]]
+; DEFAULT-NEXT:    [[TMP32:%.*]] = or <vscale x 8 x i16> [[TMP28]], [[TMP26]]
+; DEFAULT-NEXT:    [[TMP33:%.*]] = or <vscale x 8 x i16> [[TMP29]], [[TMP27]]
 ; DEFAULT-NEXT:    [[TMP34:%.*]] = lshr <vscale x 8 x i16> [[TMP32]], splat (i16 1)
 ; DEFAULT-NEXT:    [[TMP35:%.*]] = lshr <vscale x 8 x i16> [[TMP33]], splat (i16 1)
 ; DEFAULT-NEXT:    [[TMP36:%.*]] = trunc <vscale x 8 x i16> [[TMP34]] to <vscale x 8 x i8>
@@ -118,8 +116,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
 ; PRED-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP18]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
 ; PRED-NEXT:    [[TMP17:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i16>
 ; PRED-NEXT:    [[TMP22:%.*]] = mul <vscale x 16 x i16> [[TMP17]], [[TMP16]]
-; PRED-NEXT:    [[TMP24:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i16>
-; PRED-NEXT:    [[TMP20:%.*]] = or <vscale x 16 x i16> [[TMP22]], [[TMP24]]
+; PRED-NEXT:    [[TMP20:%.*]] = or <vscale x 16 x i16> [[TMP22]], [[TMP17]]
 ; PRED-NEXT:    [[TMP21:%.*]] = lshr <vscale x 16 x i16> [[TMP20]], splat (i16 1)
 ; PRED-NEXT:    [[TMP23:%.*]] = trunc <vscale x 16 x i16> [[TMP21]] to <vscale x 16 x i8>
 ; PRED-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]

llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll

Lines changed: 4 additions & 8 deletions
@@ -19,7 +19,6 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[X]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false)
 ; CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false)
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
@@ -29,7 +28,7 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <64 x i8>, ptr [[TMP4]], align 1
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <64 x i8> [[WIDE_VEC]], <64 x i8> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
 ; CHECK-NEXT:    [[TMP5:%.*]] = zext <16 x i8> [[STRIDED_VEC]] to <16 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP2]], <16 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP3]], <16 x i32> [[TMP5]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP3]], <16 x i32> [[TMP6]])
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
 ; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr [[TMP8]], align 1
@@ -58,7 +57,6 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[X]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
-; CHECK-NEXT:    [[TMP23:%.*]] = call <vscale x 2 x i32> @llvm.abs.nxv2i32(<vscale x 2 x i32> [[BROADCAST_SPLAT2]], i1 false)
 ; CHECK-NEXT:    [[TMP24:%.*]] = call <vscale x 2 x i32> @llvm.abs.nxv2i32(<vscale x 2 x i32> [[BROADCAST_SPLAT2]], i1 false)
 ; CHECK-NEXT:    [[TMP25:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0
@@ -75,7 +73,7 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3
 ; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], <vscale x 2 x i64> [[VEC_IND]], i32 0, i64 3
 ; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0(<vscale x 2 x ptr> [[TMP28]], i32 1, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i8> poison)
 ; CHECK-NEXT:    [[TMP29:%.*]] = zext <vscale x 2 x i8> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i32>
-; CHECK-NEXT:    [[TMP30:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP23]], <vscale x 2 x i32> [[TMP29]])
+; CHECK-NEXT:    [[TMP30:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP24]], <vscale x 2 x i32> [[TMP29]])
 ; CHECK-NEXT:    [[TMP31:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP24]], <vscale x 2 x i32> [[TMP30]])
 ; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX7]]
 ; CHECK-NEXT:    store <vscale x 2 x i8> zeroinitializer, ptr [[TMP32]], align 1
@@ -154,7 +152,6 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[X]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false)
 ; CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false)
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
@@ -164,7 +161,7 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <64 x i8>, ptr [[TMP4]], align 1
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <64 x i8> [[WIDE_VEC]], <64 x i8> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
 ; CHECK-NEXT:    [[TMP5:%.*]] = zext <16 x i8> [[STRIDED_VEC]] to <16 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP2]], <16 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP3]], <16 x i32> [[TMP5]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP3]], <16 x i32> [[TMP6]])
 ; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
 ; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr [[TMP26]], align 1
@@ -193,7 +190,6 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[X]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
-; CHECK-NEXT:    [[TMP23:%.*]] = call <vscale x 2 x i32> @llvm.abs.nxv2i32(<vscale x 2 x i32> [[BROADCAST_SPLAT2]], i1 false)
 ; CHECK-NEXT:    [[TMP24:%.*]] = call <vscale x 2 x i32> @llvm.abs.nxv2i32(<vscale x 2 x i32> [[BROADCAST_SPLAT2]], i1 false)
 ; CHECK-NEXT:    [[TMP25:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0
@@ -210,7 +206,7 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no
 ; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], <vscale x 2 x i64> [[VEC_IND]], i32 0, i64 3
 ; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0(<vscale x 2 x ptr> [[TMP28]], i32 1, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i8> poison)
 ; CHECK-NEXT:    [[TMP29:%.*]] = zext <vscale x 2 x i8> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i32>
-; CHECK-NEXT:    [[TMP30:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP23]], <vscale x 2 x i32> [[TMP29]])
+; CHECK-NEXT:    [[TMP30:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP24]], <vscale x 2 x i32> [[TMP29]])
 ; CHECK-NEXT:    [[TMP31:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP24]], <vscale x 2 x i32> [[TMP30]])
 ; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX7]]
 ; CHECK-NEXT:    store <vscale x 2 x i8> zeroinitializer, ptr [[TMP32]], align 1

llvm/test/Transforms/LoopVectorize/AArch64/licm-calls.ll

Lines changed: 1 addition & 2 deletions
@@ -9,9 +9,8 @@ define void @licm_replicate_call(double %x, ptr %dst) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
-; CHECK-NEXT:    [[TMP0:%.*]] = tail call double @llvm.pow.f64(double [[X]], double 3.000000e+00)
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call double @llvm.pow.f64(double [[X]], double 3.000000e+00)
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP1]], i32 1
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
