Skip to content

Commit 1d7c076

Browse files
committed
[VPlan] Introduce CSE pass
1 parent 2975e67 commit 1d7c076

28 files changed

+205
-138
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7291,6 +7291,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
72917291
VPlanTransforms::narrowInterleaveGroups(
72927292
BestVPlan, BestVF,
72937293
TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
7294+
VPlanTransforms::cse(BestVPlan);
72947295
VPlanTransforms::removeDeadRecipes(BestVPlan);
72957296

72967297
VPlanTransforms::convertToConcreteRecipes(BestVPlan);

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -805,6 +805,9 @@ class VPIRFlags {
805805

806806
GEPNoWrapFlags getGEPNoWrapFlags() const { return GEPFlags; }
807807

808+
/// Returns true if the recipe has a comparison predicate, i.e. its operation
/// type is OperationType::Cmp.
809+
bool hasPredicate() const { return OpType == OperationType::Cmp; }
810+
808811
/// Returns true if the recipe has fast-math flags.
809812
bool hasFastMathFlags() const { return OpType == OperationType::FPMathOp; }
810813

@@ -897,6 +900,11 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
897900
return R && classof(R);
898901
}
899902

903+
/// Type-inquiry overload taking a VPSingleDefRecipe pointer: converts \p U to a
/// VPRecipeBase and defers to the classof overload accepting that pointer.
/// Returns false when the conversion yields null.
static inline bool classof(const VPSingleDefRecipe *U) {
904+
auto *R = dyn_cast<VPRecipeBase>(U);
905+
return R && classof(R);
906+
}
907+
900908
void execute(VPTransformState &State) override = 0;
901909

902910
/// Compute the cost for this recipe for \p VF, using \p Opcode and \p Ctx.

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1795,6 +1795,122 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
17951795
}
17961796
}
17971797

1798+
namespace {
1799+
/// DenseMapInfo for the CSE map. Keys are VPSingleDefRecipe pointers, but
/// hashing and equality are based on the recipe's contents (recipe kind, opcode
/// or intrinsic ID, inferred scalar type, single-scalar-ness, operands and, for
/// compares, the predicate) rather than on the pointer value, so that two
/// distinct but equivalent recipes map to the same entry.
struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
1800+
/// Returns true if \p Def is the empty or tombstone key inherited from
/// DenseMapInfo, as opposed to a recipe inserted by the client.
static bool isSentinel(const VPSingleDefRecipe *Def) {
1801+
return Def == getEmptyKey() || Def == getTombstoneKey();
1802+
}
1803+
1804+
/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1805+
/// Returns an optional pair whose first element is true for an intrinsic ID
1806+
/// and false for an instruction opcode, and whose second element is the
/// ID/opcode itself. Returns std::nullopt for recipe kinds that embed neither.
1807+
static std::optional<std::pair<bool, unsigned>>
1808+
getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) {
1809+
return TypeSwitch<const VPSingleDefRecipe *,
1810+
std::optional<std::pair<bool, unsigned>>>(R)
1811+
.Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
1812+
VPWidenSelectRecipe, VPHistogramRecipe, VPPartialReductionRecipe,
1813+
VPReplicateRecipe>(
1814+
[](auto *I) { return std::make_pair(false, I->getOpcode()); })
1815+
.Case<VPWidenIntrinsicRecipe>([](auto *I) {
1816+
return std::make_pair(true, I->getVectorIntrinsicID());
1817+
})
1818+
.Default([](auto *) { return std::nullopt; });
1819+
}
1820+
1821+
/// During CSE, we can only handle certain recipes that don't read from
1822+
/// memory: if they read from memory, there could be an intervening write to
1823+
/// memory before the next instance is CSE'd, leading to an incorrect result.
1824+
/// We can extend the list of handled recipes in the future, provided we
1825+
/// account for the data embedded in them while checking for equality or
1826+
/// hashing.
1827+
static bool canHandle(const VPSingleDefRecipe *Def) {
1828+
// The issue with (Insert|Extract)Value is that the index of the
1829+
// insert/extract is not a proper operand in LLVM IR, and hence also not in
1830+
// VPlan.
1831+
if (auto C = getOpcodeOrIntrinsicID(Def))
1832+
if (!C->first && (C->second == Instruction::InsertValue ||
1833+
C->second == Instruction::ExtractValue))
1834+
return false;
// Note: VPPartialReductionRecipe is recognized by getOpcodeOrIntrinsicID but
// is not part of this list, so it is never considered for CSE.
1835+
return isa<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
1836+
VPWidenSelectRecipe, VPHistogramRecipe, VPReplicateRecipe,
1837+
VPWidenIntrinsicRecipe>(Def) &&
1838+
!Def->mayReadFromMemory();
1839+
}
1840+
1841+
/// Hash the underlying data of \p Def: its recipe ID, opcode or intrinsic ID,
/// inferred scalar type, whether it is single-scalar, and its operands, plus
/// the compare predicate when the recipe has one. \p Def is dereferenced to
/// reach its parent plan, so it has to be a real recipe, not a sentinel key.
1842+
static unsigned getHashValue(const VPSingleDefRecipe *Def) {
1843+
const VPlan *Plan = Def->getParent()->getPlan();
1844+
VPTypeAnalysis TypeInfo(*Plan);
1845+
hash_code Result = hash_combine(
1846+
Def->getVPDefID(), getOpcodeOrIntrinsicID(Def),
1847+
TypeInfo.inferScalarType(Def), vputils::isSingleScalar(Def),
1848+
hash_combine_range(Def->operands()));
1849+
if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
1850+
if (RFlags->hasPredicate())
1851+
return hash_combine(Result, RFlags->getPredicate());
1852+
return Result;
1853+
}
1854+
1855+
/// Check equality of underlying data of \p L and \p R, using the same
/// components that getHashValue hashes.
/// NOTE(review): any recipe data that is neither an operand nor listed below
/// is not compared; confirm that the recipes accepted by canHandle cannot
/// differ only in such data.
1856+
static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
// Sentinel keys are compared by pointer identity only and never dereferenced.
1857+
if (isSentinel(L) || isSentinel(R))
1858+
return L == R;
1859+
const VPlan *Plan = L->getParent()->getPlan();
1860+
VPTypeAnalysis TypeInfo(*Plan);
1861+
bool Result = L->getVPDefID() == R->getVPDefID() &&
1862+
getOpcodeOrIntrinsicID(L) == getOpcodeOrIntrinsicID(R) &&
1863+
TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R) &&
1864+
vputils::isSingleScalar(L) == vputils::isSingleScalar(R) &&
1865+
equal(L->operands(), R->operands());
// The cast of R below relies on L and R having the same recipe ID, which
// Result being true at this point guarantees.
1866+
if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
1867+
if (Result && LFlags->hasPredicate())
1868+
Result = LFlags->getPredicate() ==
1869+
cast<VPRecipeWithIRFlags>(R)->getPredicate();
// Keys that compare equal must also hash equally for the map to work.
1870+
assert((!Result || getHashValue(L) == getHashValue(R)) &&
1871+
"Divergent hashes of equal values");
1872+
return Result;
1873+
}
1874+
};
1875+
} // end anonymous namespace
1876+
1877+
/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
1878+
/// Plan. Each recipe accepted by VPCSEDenseMapInfo::canHandle is looked up in
/// a map keyed on recipe contents; if an equivalent recipe was recorded earlier
/// and its block dominates the current block, all uses of the current recipe
/// are redirected to the earlier one. Replaced recipes are not erased here.
/// Does nothing if \p Plan has no vector loop region.
1879+
void VPlanTransforms::cse(VPlan &Plan) {
1880+
VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
1881+
if (!LoopRegion)
1882+
return;
// Collect the basic blocks to visit: first those reached from the plan entry,
// then those reached from the loop region's entry.
1883+
auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
1884+
vp_depth_first_shallow(Plan.getEntry()));
1885+
auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
1886+
vp_depth_first_shallow(LoopRegion->getEntry()));
1887+
VPDominatorTree VPDT(Plan);
1888+
1889+
// There is existing logic to sink instructions into replicate regions, and
1890+
// we'd be undoing that work if we went through replicate regions. Hence,
1891+
// don't CSE in replicate regions.
1892+
DenseMap<VPSingleDefRecipe *, VPSingleDefRecipe *, VPCSEDenseMapInfo> CSEMap;
1893+
for (VPBasicBlock *VPBB :
1894+
concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
1895+
for (VPRecipeBase &R : *VPBB) {
1896+
auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
1897+
if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
1898+
continue;
// A non-null result is the previously recorded recipe equivalent to Def.
1899+
if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
1900+
// V must dominate Def for a valid replacement. The check is made on the
// parent blocks. When it fails, Def is neither replaced nor recorded, so
// V stays the map's representative for this expression.
1901+
if (!VPDT.dominates(V->getParent(), VPBB))
1902+
continue;
1903+
// Drop poison-generating flags when reusing a value: flags are not part
// of the equality check, so Def may have carried different flags than V.
1904+
if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
1905+
RFlags->dropPoisonGeneratingFlags();
1906+
Def->replaceAllUsesWith(V);
1907+
continue;
1908+
}
// First occurrence of this expression: record it as the representative.
1909+
CSEMap[Def] = Def;
1910+
}
1911+
}
1912+
}
1913+
17981914
/// Move loop-invariant recipes out of the vector loop region in \p Plan.
17991915
static void licm(VPlan &Plan) {
18001916
VPBasicBlock *Preheader = Plan.getVectorPreheader();

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,10 @@ struct VPlanTransforms {
246246
/// removing dead edges to their successors.
247247
static void removeBranchOnConst(VPlan &Plan);
248248

249+
/// Perform common-subexpression-elimination on \p Plan. Intended to run late:
250+
/// LoopVectorizationPlanner::executePlan invokes it on the plan selected for
/// execution, just before removeDeadRecipes.
251+
static void cse(VPlan &Plan);
252+
249253
/// If there's a single exit block, optimize its phi recipes that use exiting
250254
/// IV values by feeding them precomputed end values instead, possibly taken
251255
/// one step backwards.

llvm/test/Transforms/LoopVectorize/AArch64/drop-poison-generating-flags.ll

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -54,15 +54,11 @@ define void @check_widen_intrinsic_with_nnan(ptr noalias %dst.0, ptr noalias %ds
5454
; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE6]]
5555
; CHECK: [[PRED_LOAD_CONTINUE6]]:
5656
; CHECK-NEXT: [[TMP24:%.*]] = phi <4 x double> [ [[TMP20]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP23]], %[[PRED_LOAD_IF5]] ]
57-
; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], -1
58-
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[DST_0]], i64 [[TMP25]]
59-
; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> zeroinitializer, ptr [[TMP26]], i32 8, <4 x i1> [[TMP4]])
57+
; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> zeroinitializer, ptr [[TMP7]], i32 8, <4 x i1> [[TMP4]])
6058
; CHECK-NEXT: [[TMP28:%.*]] = fcmp oeq <4 x double> [[TMP24]], zeroinitializer
6159
; CHECK-NEXT: [[TMP29:%.*]] = select <4 x i1> [[TMP4]], <4 x i1> [[TMP28]], <4 x i1> zeroinitializer
6260
; CHECK-NEXT: [[TMP30:%.*]] = or <4 x i1> [[TMP5]], [[TMP29]]
63-
; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i1> [[TMP29]], i32 0
64-
; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP31]], i64 [[TMP25]], i64 [[TMP6]]
65-
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[DST_1]], i64 [[PREDPHI]]
61+
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[DST_1]], i64 [[TMP6]]
6662
; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> splat (i32 10), ptr [[TMP32]], i32 4, <4 x i1> [[TMP30]])
6763
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
6864
; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000

llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -330,11 +330,10 @@ define void @test_widen_induction_step_2(ptr %A, i64 %N, i32 %step) {
330330
; CHECK-NEXT: [[CMP_N11:%.*]] = icmp eq i64 [[N]], [[IND_END]]
331331
; CHECK-NEXT: br i1 [[CMP_N11]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
332332
; CHECK: vec.epilog.scalar.ph:
333-
; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
334-
; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
333+
; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
335334
; CHECK-NEXT: br label [[LOOP:%.*]]
336335
; CHECK: loop:
337-
; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL5]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
336+
; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL6]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
338337
; CHECK-NEXT: [[IV_2:%.*]] = phi i64 [ [[BC_RESUME_VAL6]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_1_NEXT]], [[LOOP]] ]
339338
; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_1]]
340339
; CHECK-NEXT: [[ADD:%.*]] = add i64 [[IV_2]], 10

llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -44,10 +44,8 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
4444
; DEFAULT-NEXT: [[TMP27:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i16>
4545
; DEFAULT-NEXT: [[TMP28:%.*]] = mul <vscale x 8 x i16> [[TMP26]], [[TMP13]]
4646
; DEFAULT-NEXT: [[TMP29:%.*]] = mul <vscale x 8 x i16> [[TMP27]], [[TMP13]]
47-
; DEFAULT-NEXT: [[TMP30:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i16>
48-
; DEFAULT-NEXT: [[TMP31:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i16>
49-
; DEFAULT-NEXT: [[TMP32:%.*]] = or <vscale x 8 x i16> [[TMP28]], [[TMP30]]
50-
; DEFAULT-NEXT: [[TMP33:%.*]] = or <vscale x 8 x i16> [[TMP29]], [[TMP31]]
47+
; DEFAULT-NEXT: [[TMP32:%.*]] = or <vscale x 8 x i16> [[TMP28]], [[TMP26]]
48+
; DEFAULT-NEXT: [[TMP33:%.*]] = or <vscale x 8 x i16> [[TMP29]], [[TMP27]]
5149
; DEFAULT-NEXT: [[TMP34:%.*]] = lshr <vscale x 8 x i16> [[TMP32]], splat (i16 1)
5250
; DEFAULT-NEXT: [[TMP35:%.*]] = lshr <vscale x 8 x i16> [[TMP33]], splat (i16 1)
5351
; DEFAULT-NEXT: [[TMP36:%.*]] = trunc <vscale x 8 x i16> [[TMP34]] to <vscale x 8 x i8>
@@ -118,8 +116,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
118116
; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP18]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
119117
; PRED-NEXT: [[TMP17:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i16>
120118
; PRED-NEXT: [[TMP22:%.*]] = mul <vscale x 16 x i16> [[TMP17]], [[TMP16]]
121-
; PRED-NEXT: [[TMP24:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i16>
122-
; PRED-NEXT: [[TMP20:%.*]] = or <vscale x 16 x i16> [[TMP22]], [[TMP24]]
119+
; PRED-NEXT: [[TMP20:%.*]] = or <vscale x 16 x i16> [[TMP22]], [[TMP17]]
123120
; PRED-NEXT: [[TMP21:%.*]] = lshr <vscale x 16 x i16> [[TMP20]], splat (i16 1)
124121
; PRED-NEXT: [[TMP23:%.*]] = trunc <vscale x 16 x i16> [[TMP21]] to <vscale x 16 x i8>
125122
; PRED-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]

llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3
1919
; CHECK: [[VECTOR_PH]]:
2020
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[X]], i64 0
2121
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
22-
; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false)
2322
; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false)
2423
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
2524
; CHECK: [[VECTOR_BODY]]:
@@ -29,7 +28,7 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3
2928
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <64 x i8>, ptr [[TMP4]], align 1
3029
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <64 x i8> [[WIDE_VEC]], <64 x i8> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
3130
; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[STRIDED_VEC]] to <16 x i32>
32-
; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP2]], <16 x i32> [[TMP5]])
31+
; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP3]], <16 x i32> [[TMP5]])
3332
; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP3]], <16 x i32> [[TMP6]])
3433
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
3534
; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP8]], align 1
@@ -58,7 +57,6 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3
5857
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[X]], i64 0
5958
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
6059
; CHECK-NEXT: [[TMP22:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
61-
; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 2 x i32> @llvm.abs.nxv2i32(<vscale x 2 x i32> [[BROADCAST_SPLAT2]], i1 false)
6260
; CHECK-NEXT: [[TMP24:%.*]] = call <vscale x 2 x i32> @llvm.abs.nxv2i32(<vscale x 2 x i32> [[BROADCAST_SPLAT2]], i1 false)
6361
; CHECK-NEXT: [[TMP25:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
6462
; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0
@@ -75,7 +73,7 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3
7573
; CHECK-NEXT: [[TMP28:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], <vscale x 2 x i64> [[VEC_IND]], i32 0, i64 3
7674
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0(<vscale x 2 x ptr> [[TMP28]], i32 1, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i8> poison)
7775
; CHECK-NEXT: [[TMP29:%.*]] = zext <vscale x 2 x i8> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i32>
78-
; CHECK-NEXT: [[TMP30:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP23]], <vscale x 2 x i32> [[TMP29]])
76+
; CHECK-NEXT: [[TMP30:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP24]], <vscale x 2 x i32> [[TMP29]])
7977
; CHECK-NEXT: [[TMP31:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP24]], <vscale x 2 x i32> [[TMP30]])
8078
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX7]]
8179
; CHECK-NEXT: store <vscale x 2 x i8> zeroinitializer, ptr [[TMP32]], align 1
@@ -154,7 +152,6 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no
154152
; CHECK: [[VECTOR_PH]]:
155153
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[X]], i64 0
156154
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
157-
; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false)
158155
; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false)
159156
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
160157
; CHECK: [[VECTOR_BODY]]:
@@ -164,7 +161,7 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no
164161
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <64 x i8>, ptr [[TMP4]], align 1
165162
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <64 x i8> [[WIDE_VEC]], <64 x i8> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
166163
; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[STRIDED_VEC]] to <16 x i32>
167-
; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP2]], <16 x i32> [[TMP5]])
164+
; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP3]], <16 x i32> [[TMP5]])
168165
; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP3]], <16 x i32> [[TMP6]])
169166
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
170167
; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP26]], align 1
@@ -193,7 +190,6 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no
193190
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[X]], i64 0
194191
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
195192
; CHECK-NEXT: [[TMP22:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
196-
; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 2 x i32> @llvm.abs.nxv2i32(<vscale x 2 x i32> [[BROADCAST_SPLAT2]], i1 false)
197193
; CHECK-NEXT: [[TMP24:%.*]] = call <vscale x 2 x i32> @llvm.abs.nxv2i32(<vscale x 2 x i32> [[BROADCAST_SPLAT2]], i1 false)
198194
; CHECK-NEXT: [[TMP25:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
199195
; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0
@@ -210,7 +206,7 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no
210206
; CHECK-NEXT: [[TMP28:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], <vscale x 2 x i64> [[VEC_IND]], i32 0, i64 3
211207
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0(<vscale x 2 x ptr> [[TMP28]], i32 1, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i8> poison)
212208
; CHECK-NEXT: [[TMP29:%.*]] = zext <vscale x 2 x i8> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i32>
213-
; CHECK-NEXT: [[TMP30:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP23]], <vscale x 2 x i32> [[TMP29]])
209+
; CHECK-NEXT: [[TMP30:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP24]], <vscale x 2 x i32> [[TMP29]])
214210
; CHECK-NEXT: [[TMP31:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP24]], <vscale x 2 x i32> [[TMP30]])
215211
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX7]]
216212
; CHECK-NEXT: store <vscale x 2 x i8> zeroinitializer, ptr [[TMP32]], align 1

llvm/test/Transforms/LoopVectorize/AArch64/licm-calls.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,8 @@ define void @licm_replicate_call(double %x, ptr %dst) {
99
; CHECK-NEXT: [[ENTRY:.*]]:
1010
; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
1111
; CHECK: [[VECTOR_PH]]:
12-
; CHECK-NEXT: [[TMP0:%.*]] = tail call double @llvm.pow.f64(double [[X]], double 3.000000e+00)
1312
; CHECK-NEXT: [[TMP1:%.*]] = tail call double @llvm.pow.f64(double [[X]], double 3.000000e+00)
14-
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0]], i32 0
13+
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i32 0
1514
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP1]], i32 1
1615
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
1716
; CHECK: [[VECTOR_BODY]]:

0 commit comments

Comments
 (0)