Skip to content

Commit 039adf6

Browse files
committed
[VPlan] Introduce CSE pass
1 parent 704dee2 commit 039adf6

28 files changed

+197
-132
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7291,6 +7291,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
72917291
VPlanTransforms::narrowInterleaveGroups(
72927292
BestVPlan, BestVF,
72937293
TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
7294+
VPlanTransforms::cse(BestVPlan);
72947295
VPlanTransforms::removeDeadRecipes(BestVPlan);
72957296

72967297
VPlanTransforms::convertToConcreteRecipes(BestVPlan);

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -900,6 +900,11 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
900900
return R && classof(R);
901901
}
902902

903+
static inline bool classof(const VPSingleDefRecipe *U) {
904+
auto *R = dyn_cast<VPRecipeBase>(U);
905+
return R && classof(R);
906+
}
907+
903908
void execute(VPTransformState &State) override = 0;
904909

905910
/// Compute the cost for this recipe for \p VF, using \p Opcode and \p Ctx.

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1792,6 +1792,122 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
17921792
}
17931793
}
17941794

1795+
namespace {
1796+
/// DenseMap traits used by the VPlan CSE pass. Keys are VPSingleDefRecipe
/// pointers, but hashing and equality look at the data carried by the recipe
/// (kind, opcode or intrinsic ID, inferred scalar type, single-scalar-ness,
/// operands and compare predicate) instead of the pointer value, so that two
/// distinct recipes describing the same computation land on the same entry.
struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
  /// Return true if \p Def is one of the reserved empty/tombstone keys, which
  /// must never be dereferenced.
  static bool isSentinel(const VPSingleDefRecipe *Def) {
    return Def == getEmptyKey() || Def == getTombstoneKey();
  }

  /// Extract the instruction opcode or intrinsic ID stored in recipe \p R.
  /// The first member of the returned pair is true when the second member is
  /// an intrinsic ID and false when it is an instruction opcode. Recipes that
  /// carry neither yield std::nullopt.
  static std::optional<std::pair<bool, unsigned>>
  getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) {
    using OpcodeOrID = std::optional<std::pair<bool, unsigned>>;
    return TypeSwitch<const VPSingleDefRecipe *, OpcodeOrID>(R)
        .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
              VPWidenSelectRecipe, VPHistogramRecipe, VPPartialReductionRecipe,
              VPReplicateRecipe>([](auto *Recipe) {
          return std::make_pair(false, Recipe->getOpcode());
        })
        .Case<VPWidenIntrinsicRecipe>([](auto *Recipe) {
          return std::make_pair(true, Recipe->getVectorIntrinsicID());
        })
        .Default([](auto *) { return std::nullopt; });
  }

  /// Return true if \p Def is a recipe the CSE pass may deduplicate.
  ///
  /// Only a fixed set of recipe kinds is accepted, and only when the recipe
  /// does not read from memory: a write between two otherwise identical reads
  /// would make reusing the first result incorrect. More kinds can be added
  /// later as long as any extra data they embed is also covered by
  /// getHashValue() and isEqual().
  static bool canHandle(const VPSingleDefRecipe *Def) {
    // InsertValue/ExtractValue keep their index outside the operand list, in
    // LLVM IR and therefore in VPlan as well, so comparing operands alone
    // cannot tell two of them apart.
    const auto OpcodeOrID = getOpcodeOrIntrinsicID(Def);
    const bool IsAggregateIndexOp =
        OpcodeOrID && !OpcodeOrID->first &&
        (OpcodeOrID->second == Instruction::InsertValue ||
         OpcodeOrID->second == Instruction::ExtractValue);
    if (IsAggregateIndexOp)
      return false;

    if (!isa<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
             VPWidenSelectRecipe, VPHistogramRecipe, VPReplicateRecipe,
             VPWidenIntrinsicRecipe>(Def))
      return false;
    return !Def->mayReadFromMemory();
  }

  /// Hash the data carried by \p Def. The compare predicate is mixed in only
  /// for recipes that have one.
  static unsigned getHashValue(const VPSingleDefRecipe *Def) {
    const VPlan *Plan = Def->getParent()->getPlan();
    VPTypeAnalysis TypeInfo(*Plan);
    hash_code Hash = hash_combine(
        Def->getVPDefID(), getOpcodeOrIntrinsicID(Def),
        TypeInfo.inferScalarType(Def), vputils::isSingleScalar(Def),
        hash_combine_range(Def->operands()));
    const auto *WithFlags = dyn_cast<VPRecipeWithIRFlags>(Def);
    if (WithFlags && WithFlags->hasPredicate())
      return hash_combine(Hash, WithFlags->getPredicate());
    return Hash;
  }

  /// Compare the data carried by \p L and \p R. Sentinel keys are compared by
  /// pointer identity only.
  static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
    if (isSentinel(L) || isSentinel(R))
      return L == R;

    const VPlan *Plan = L->getParent()->getPlan();
    VPTypeAnalysis TypeInfo(*Plan);
    bool Same = L->getVPDefID() == R->getVPDefID() &&
                getOpcodeOrIntrinsicID(L) == getOpcodeOrIntrinsicID(R) &&
                TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R) &&
                vputils::isSingleScalar(L) == vputils::isSingleScalar(R) &&
                equal(L->operands(), R->operands());
    if (Same) {
      // The checks above already matched the recipe kind and opcode, so R is
      // cast unconditionally once L is known to carry a predicate.
      const auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L);
      if (LFlags && LFlags->hasPredicate())
        Same = LFlags->getPredicate() ==
               cast<VPRecipeWithIRFlags>(R)->getPredicate();
    }
    assert((!Same || getHashValue(L) == getHashValue(R)) &&
           "Divergent hashes of equal values");
    return Same;
  }
};
1872+
} // end anonymous namespace
1873+
1874+
/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
1875+
/// Plan.
1876+
void VPlanTransforms::cse(VPlan &Plan) {
1877+
VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
1878+
if (!LoopRegion)
1879+
return;
1880+
auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
1881+
vp_depth_first_shallow(Plan.getEntry()));
1882+
auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
1883+
vp_depth_first_shallow(LoopRegion->getEntry()));
1884+
VPDominatorTree VPDT(Plan);
1885+
1886+
// There is existing logic to sink instructions into replicate regions, and
1887+
// we'd be undoing that work if we went through replicate regions. Hence,
1888+
// don't CSE in replicate regions.
1889+
DenseMap<VPSingleDefRecipe *, VPSingleDefRecipe *, VPCSEDenseMapInfo> CSEMap;
1890+
for (VPBasicBlock *VPBB :
1891+
concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
1892+
for (VPRecipeBase &R : *VPBB) {
1893+
auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
1894+
if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
1895+
continue;
1896+
if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
1897+
// V must dominate Def for a valid replacement.
1898+
if (!VPDT.dominates(V->getParent(), VPBB))
1899+
continue;
1900+
// Drop poison-generating flags when reusing a value.
1901+
if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
1902+
RFlags->dropPoisonGeneratingFlags();
1903+
Def->replaceAllUsesWith(V);
1904+
continue;
1905+
}
1906+
CSEMap[Def] = Def;
1907+
}
1908+
}
1909+
}
1910+
17951911
/// Move loop-invariant recipes out of the vector loop region in \p Plan.
17961912
static void licm(VPlan &Plan) {
17971913
VPBasicBlock *Preheader = Plan.getVectorPreheader();

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,10 @@ struct VPlanTransforms {
242242
/// removing dead edges to their successors.
243243
static void removeBranchOnConst(VPlan &Plan);
244244

245+
/// Perform common-subexpression-elimination on \p Plan.
246+
/// NOTE(review): this was described as "best done after the \p Plan is
/// executed", yet the visible call site runs it from executePlan, ahead of
/// convertToConcreteRecipes. Presumably "late in the pipeline" was meant;
/// confirm the intended wording.
247+
static void cse(VPlan &Plan);
248+
245249
/// If there's a single exit block, optimize its phi recipes that use exiting
246250
/// IV values by feeding them precomputed end values instead, possibly taken
247251
/// one step backwards.

llvm/test/Transforms/LoopVectorize/AArch64/drop-poison-generating-flags.ll

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -54,15 +54,11 @@ define void @check_widen_intrinsic_with_nnan(ptr noalias %dst.0, ptr noalias %ds
5454
; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE6]]
5555
; CHECK: [[PRED_LOAD_CONTINUE6]]:
5656
; CHECK-NEXT: [[TMP24:%.*]] = phi <4 x double> [ [[TMP20]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP23]], %[[PRED_LOAD_IF5]] ]
57-
; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], -1
58-
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[DST_0]], i64 [[TMP25]]
59-
; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> zeroinitializer, ptr [[TMP26]], i32 8, <4 x i1> [[TMP4]])
57+
; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> zeroinitializer, ptr [[TMP7]], i32 8, <4 x i1> [[TMP4]])
6058
; CHECK-NEXT: [[TMP28:%.*]] = fcmp oeq <4 x double> [[TMP24]], zeroinitializer
6159
; CHECK-NEXT: [[TMP29:%.*]] = select <4 x i1> [[TMP4]], <4 x i1> [[TMP28]], <4 x i1> zeroinitializer
6260
; CHECK-NEXT: [[TMP30:%.*]] = or <4 x i1> [[TMP5]], [[TMP29]]
63-
; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i1> [[TMP29]], i32 0
64-
; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP31]], i64 [[TMP25]], i64 [[TMP6]]
65-
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[DST_1]], i64 [[PREDPHI]]
61+
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[DST_1]], i64 [[TMP6]]
6662
; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> splat (i32 10), ptr [[TMP32]], i32 4, <4 x i1> [[TMP30]])
6763
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
6864
; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000

llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -330,11 +330,10 @@ define void @test_widen_induction_step_2(ptr %A, i64 %N, i32 %step) {
330330
; CHECK-NEXT: [[CMP_N11:%.*]] = icmp eq i64 [[N]], [[IND_END]]
331331
; CHECK-NEXT: br i1 [[CMP_N11]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
332332
; CHECK: vec.epilog.scalar.ph:
333-
; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
334-
; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
333+
; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
335334
; CHECK-NEXT: br label [[LOOP:%.*]]
336335
; CHECK: loop:
337-
; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL5]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
336+
; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL6]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
338337
; CHECK-NEXT: [[IV_2:%.*]] = phi i64 [ [[BC_RESUME_VAL6]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_1_NEXT]], [[LOOP]] ]
339338
; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_1]]
340339
; CHECK-NEXT: [[ADD:%.*]] = add i64 [[IV_2]], 10

llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -44,10 +44,8 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
4444
; DEFAULT-NEXT: [[TMP27:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i16>
4545
; DEFAULT-NEXT: [[TMP28:%.*]] = mul <vscale x 8 x i16> [[TMP26]], [[TMP13]]
4646
; DEFAULT-NEXT: [[TMP29:%.*]] = mul <vscale x 8 x i16> [[TMP27]], [[TMP13]]
47-
; DEFAULT-NEXT: [[TMP30:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i16>
48-
; DEFAULT-NEXT: [[TMP31:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i16>
49-
; DEFAULT-NEXT: [[TMP32:%.*]] = or <vscale x 8 x i16> [[TMP28]], [[TMP30]]
50-
; DEFAULT-NEXT: [[TMP33:%.*]] = or <vscale x 8 x i16> [[TMP29]], [[TMP31]]
47+
; DEFAULT-NEXT: [[TMP32:%.*]] = or <vscale x 8 x i16> [[TMP28]], [[TMP26]]
48+
; DEFAULT-NEXT: [[TMP33:%.*]] = or <vscale x 8 x i16> [[TMP29]], [[TMP27]]
5149
; DEFAULT-NEXT: [[TMP34:%.*]] = lshr <vscale x 8 x i16> [[TMP32]], splat (i16 1)
5250
; DEFAULT-NEXT: [[TMP35:%.*]] = lshr <vscale x 8 x i16> [[TMP33]], splat (i16 1)
5351
; DEFAULT-NEXT: [[TMP36:%.*]] = trunc <vscale x 8 x i16> [[TMP34]] to <vscale x 8 x i8>
@@ -118,8 +116,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
118116
; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP18]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
119117
; PRED-NEXT: [[TMP17:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i16>
120118
; PRED-NEXT: [[TMP22:%.*]] = mul <vscale x 16 x i16> [[TMP17]], [[TMP16]]
121-
; PRED-NEXT: [[TMP24:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i16>
122-
; PRED-NEXT: [[TMP20:%.*]] = or <vscale x 16 x i16> [[TMP22]], [[TMP24]]
119+
; PRED-NEXT: [[TMP20:%.*]] = or <vscale x 16 x i16> [[TMP22]], [[TMP17]]
123120
; PRED-NEXT: [[TMP21:%.*]] = lshr <vscale x 16 x i16> [[TMP20]], splat (i16 1)
124121
; PRED-NEXT: [[TMP23:%.*]] = trunc <vscale x 16 x i16> [[TMP21]] to <vscale x 16 x i8>
125122
; PRED-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]

llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3
1919
; CHECK: [[VECTOR_PH]]:
2020
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[X]], i64 0
2121
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
22-
; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false)
2322
; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false)
2423
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
2524
; CHECK: [[VECTOR_BODY]]:
@@ -29,7 +28,7 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3
2928
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <64 x i8>, ptr [[TMP4]], align 1
3029
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <64 x i8> [[WIDE_VEC]], <64 x i8> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
3130
; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[STRIDED_VEC]] to <16 x i32>
32-
; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP2]], <16 x i32> [[TMP5]])
31+
; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP3]], <16 x i32> [[TMP5]])
3332
; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP3]], <16 x i32> [[TMP6]])
3433
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
3534
; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP8]], align 1
@@ -58,7 +57,6 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3
5857
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[X]], i64 0
5958
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
6059
; CHECK-NEXT: [[TMP22:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
61-
; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 2 x i32> @llvm.abs.nxv2i32(<vscale x 2 x i32> [[BROADCAST_SPLAT2]], i1 false)
6260
; CHECK-NEXT: [[TMP24:%.*]] = call <vscale x 2 x i32> @llvm.abs.nxv2i32(<vscale x 2 x i32> [[BROADCAST_SPLAT2]], i1 false)
6361
; CHECK-NEXT: [[TMP25:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
6462
; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0
@@ -75,7 +73,7 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3
7573
; CHECK-NEXT: [[TMP28:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], <vscale x 2 x i64> [[VEC_IND]], i32 0, i64 3
7674
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0(<vscale x 2 x ptr> [[TMP28]], i32 1, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i8> poison)
7775
; CHECK-NEXT: [[TMP29:%.*]] = zext <vscale x 2 x i8> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i32>
78-
; CHECK-NEXT: [[TMP30:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP23]], <vscale x 2 x i32> [[TMP29]])
76+
; CHECK-NEXT: [[TMP30:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP24]], <vscale x 2 x i32> [[TMP29]])
7977
; CHECK-NEXT: [[TMP31:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP24]], <vscale x 2 x i32> [[TMP30]])
8078
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX7]]
8179
; CHECK-NEXT: store <vscale x 2 x i8> zeroinitializer, ptr [[TMP32]], align 1
@@ -154,7 +152,6 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no
154152
; CHECK: [[VECTOR_PH]]:
155153
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[X]], i64 0
156154
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
157-
; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false)
158155
; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false)
159156
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
160157
; CHECK: [[VECTOR_BODY]]:
@@ -164,7 +161,7 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no
164161
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <64 x i8>, ptr [[TMP4]], align 1
165162
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <64 x i8> [[WIDE_VEC]], <64 x i8> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
166163
; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[STRIDED_VEC]] to <16 x i32>
167-
; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP2]], <16 x i32> [[TMP5]])
164+
; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP3]], <16 x i32> [[TMP5]])
168165
; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP3]], <16 x i32> [[TMP6]])
169166
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
170167
; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP26]], align 1
@@ -193,7 +190,6 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no
193190
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[X]], i64 0
194191
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
195192
; CHECK-NEXT: [[TMP22:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
196-
; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 2 x i32> @llvm.abs.nxv2i32(<vscale x 2 x i32> [[BROADCAST_SPLAT2]], i1 false)
197193
; CHECK-NEXT: [[TMP24:%.*]] = call <vscale x 2 x i32> @llvm.abs.nxv2i32(<vscale x 2 x i32> [[BROADCAST_SPLAT2]], i1 false)
198194
; CHECK-NEXT: [[TMP25:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
199195
; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0
@@ -210,7 +206,7 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no
210206
; CHECK-NEXT: [[TMP28:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], <vscale x 2 x i64> [[VEC_IND]], i32 0, i64 3
211207
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0(<vscale x 2 x ptr> [[TMP28]], i32 1, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i8> poison)
212208
; CHECK-NEXT: [[TMP29:%.*]] = zext <vscale x 2 x i8> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i32>
213-
; CHECK-NEXT: [[TMP30:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP23]], <vscale x 2 x i32> [[TMP29]])
209+
; CHECK-NEXT: [[TMP30:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP24]], <vscale x 2 x i32> [[TMP29]])
214210
; CHECK-NEXT: [[TMP31:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP24]], <vscale x 2 x i32> [[TMP30]])
215211
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX7]]
216212
; CHECK-NEXT: store <vscale x 2 x i8> zeroinitializer, ptr [[TMP32]], align 1

llvm/test/Transforms/LoopVectorize/AArch64/licm-calls.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,8 @@ define void @licm_replicate_call(double %x, ptr %dst) {
99
; CHECK-NEXT: [[ENTRY:.*]]:
1010
; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
1111
; CHECK: [[VECTOR_PH]]:
12-
; CHECK-NEXT: [[TMP0:%.*]] = tail call double @llvm.pow.f64(double [[X]], double 3.000000e+00)
1312
; CHECK-NEXT: [[TMP1:%.*]] = tail call double @llvm.pow.f64(double [[X]], double 3.000000e+00)
14-
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0]], i32 0
13+
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i32 0
1514
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP1]], i32 1
1615
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
1716
; CHECK: [[VECTOR_BODY]]:

0 commit comments

Comments
 (0)