Skip to content

Commit 2227361

Browse files
committed
[VPlan] Introduce CSE pass
Requires #151487 to completely subsume the non-VPlan based limited CSE. Inspired by #146856, although the test from that PR remains unchanged: still investigating.
1 parent c3a404a commit 2227361

24 files changed

+195
-155
lines changed

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,36 @@ void VPDef::dump() const {
122122
}
123123
#endif
124124

125+
bool VPValue::isIdenticalTo(const VPValue *Other) const {
126+
if (getVPValueID() != Other->getVPValueID() ||
127+
hasDefiningRecipe() != Other->hasDefiningRecipe() ||
128+
getUnderlyingValue() != Other->getUnderlyingValue())
129+
return false;
130+
if (hasDefiningRecipe()) {
131+
const VPRecipeBase *DefL = getDefiningRecipe();
132+
const VPRecipeBase *DefR = Other->getDefiningRecipe();
133+
if (vputils::getOpcode(*DefL) != vputils::getOpcode(*DefR) ||
134+
vputils::getResultTypeData(*DefL) !=
135+
vputils::getResultTypeData(*DefR) ||
136+
DefL->getNumOperands() != DefR->getNumOperands())
137+
return false;
138+
return equal(DefL->operands(), DefR->operands());
139+
}
140+
return true;
141+
}
142+
143+
/// Hash \p V by its data instead of by its address. A value defined by a
/// recipe is hashed from the recipe's opcode, result type data and operands;
/// any other value is hashed from its VPValue ID, combined with the underlying
/// value when one is set.
hash_code llvm::hash_value(const VPValue &V) {
  if (!V.hasDefiningRecipe()) {
    Value *Underlying = V.getUnderlyingValue();
    if (!Underlying)
      return hash_value(V.getVPValueID());
    return hash_combine(V.getVPValueID(), Underlying);
  }

  const VPRecipeBase *Recipe = V.getDefiningRecipe();
  return hash_combine(vputils::getOpcode(*Recipe),
                      vputils::getResultTypeData(*Recipe),
                      hash_combine_range(Recipe->operands()));
}
154+
125155
/// Return Def viewed as a VPRecipeBase. cast_or_null passes a null Def through
/// as a null result instead of asserting.
VPRecipeBase *VPValue::getDefiningRecipe() {
126156
return cast_or_null<VPRecipeBase>(Def);
127157
}

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1755,6 +1755,51 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
17551755
}
17561756
}
17571757

1758+
/// Hash the underlying data of a VPSingleDefRecipe pointer, instead of hashing
1759+
/// the pointer itself.
1760+
namespace {
1761+
struct CSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
1762+
static unsigned getHashValue(const VPSingleDefRecipe *R) {
1763+
return hash_value(*R);
1764+
}
1765+
1766+
static bool isEqual(const VPSingleDefRecipe *LHS,
1767+
const VPSingleDefRecipe *RHS) {
1768+
if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
1769+
LHS == getTombstoneKey() || RHS == getTombstoneKey())
1770+
return LHS == RHS;
1771+
return LHS->isIdenticalTo(RHS);
1772+
}
1773+
};
1774+
} // end anonymous namespace
1775+
1776+
/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
1777+
/// Plan.
1778+
static void cse(VPlan &Plan, Type &CanonicalIVTy) {
1779+
DenseMap<VPSingleDefRecipe *, VPSingleDefRecipe *, CSEDenseMapInfo> CSEMap;
1780+
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1781+
vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
1782+
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1783+
auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
1784+
if (!Def)
1785+
continue;
1786+
// Check if we can replace this instruction with any of the
1787+
// visited instructions.
1788+
if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
1789+
#ifndef NDEBUG
1790+
VPTypeAnalysis TypeInfo(&CanonicalIVTy);
1791+
assert(TypeInfo.inferScalarType(Def) == TypeInfo.inferScalarType(V) &&
1792+
"CSE is attempting to perform an invalid replacement");
1793+
#endif
1794+
Def->replaceAllUsesWith(V);
1795+
Def->eraseFromParent();
1796+
} else {
1797+
CSEMap.insert_or_assign(Def, Def);
1798+
}
1799+
}
1800+
}
1801+
}
1802+
17581803
/// Move loop-invariant recipes out of the vector loop region in \p Plan.
17591804
static void licm(VPlan &Plan) {
17601805
VPBasicBlock *Preheader = Plan.getVectorPreheader();
@@ -1930,6 +1975,7 @@ void VPlanTransforms::optimize(VPlan &Plan) {
19301975

19311976
runPass(createAndOptimizeReplicateRegions, Plan);
19321977
runPass(mergeBlocksIntoPredecessors, Plan);
1978+
runPass(cse, Plan, *Plan.getCanonicalIV()->getScalarType());
19331979
runPass(licm, Plan);
19341980
}
19351981

llvm/lib/Transforms/Vectorize/VPlanUtils.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#define LLVM_TRANSFORMS_VECTORIZE_VPLANUTILS_H
1111

1212
#include "VPlan.h"
13+
#include "llvm/ADT/TypeSwitch.h"
1314

1415
namespace llvm {
1516
class ScalarEvolution;
@@ -37,6 +38,26 @@ VPValue *getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr,
3738
/// SCEV expression could be constructed.
3839
const SCEV *getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE);
3940

41+
/// Get any instruction opcode data embedded in recipe \p R.
42+
inline std::optional<unsigned> getOpcode(const VPRecipeBase &R) {
43+
return TypeSwitch<const VPRecipeBase *, std::optional<unsigned>>(&R)
44+
.Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
45+
VPWidenSelectRecipe>([](auto *I) { return I->getOpcode(); })
46+
.Default([](auto *) { return std::nullopt; });
47+
}
48+
49+
/// Get any result type data embedded in recipe \p R.
50+
inline Type *getResultTypeData(const VPRecipeBase &R) {
51+
if (auto *RTy = dyn_cast<VPInstructionWithType>(&R))
52+
return RTy->getResultType();
53+
if (auto *W = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
54+
if (const TruncInst *Trunc = W->getTruncInst())
55+
return Trunc->getDestTy();
56+
if (auto *S = dyn_cast<VPExpandSCEVRecipe>(&R))
57+
return S->getSCEV()->getType();
58+
return nullptr;
59+
}
60+
4061
/// Returns true if \p VPV is a single scalar, either because it produces the
4162
/// same value for all lanes or only has its first lane used.
4263
inline bool isSingleScalar(const VPValue *VPV) {

llvm/lib/Transforms/Vectorize/VPlanValue.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,8 +185,15 @@ class LLVM_ABI_FOR_TEST VPValue {
185185
assert(!UnderlyingVal && "Underlying Value is already set.");
186186
UnderlyingVal = Val;
187187
}
188+
189+
// Equality of data.
190+
bool isIdenticalTo(const VPValue *Other) const;
188191
};
189192

193+
// Hash a VPValue by its data rather than by its address, so that VPValues can
194+
// be de-duplicated in hash-based containers.
195+
hash_code hash_value(const VPValue &Arg);
196+
190197
typedef DenseMap<Value *, VPValue *> Value2VPValueTy;
191198
typedef DenseMap<VPValue *, Value *> VPValue2ValueTy;
192199

llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,7 @@ define void @saddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr noca
3333
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
3434
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
3535
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC:%.*]], i64 [[OFFSET_IDX]]
36-
; CHECK-NEXT: [[OFFSET_IDX2:%.*]] = shl i64 [[INDEX]], 1
37-
; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[OFFSET_IDX2]]
36+
; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[OFFSET_IDX]]
3837
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 16
3938
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2
4039
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2
@@ -75,8 +74,7 @@ define void @saddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr noca
7574
; CHECK-NEXT: [[INDEX15:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT23:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
7675
; CHECK-NEXT: [[OFFSET_IDX16:%.*]] = shl i64 [[INDEX15]], 1
7776
; CHECK-NEXT: [[NEXT_GEP17:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[OFFSET_IDX16]]
78-
; CHECK-NEXT: [[OFFSET_IDX18:%.*]] = shl i64 [[INDEX15]], 1
79-
; CHECK-NEXT: [[NEXT_GEP19:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[OFFSET_IDX18]]
77+
; CHECK-NEXT: [[NEXT_GEP19:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[OFFSET_IDX16]]
8078
; CHECK-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x i16>, ptr [[NEXT_GEP17]], align 2
8179
; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[WIDE_LOAD20]], <4 x i16> [[BROADCAST_SPLAT22]])
8280
; CHECK-NEXT: store <4 x i16> [[TMP10]], ptr [[NEXT_GEP19]], align 2

llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -316,8 +316,7 @@ define i64 @not_dotp_i16_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %
316316
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
317317
; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
318318
; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[OFFSET_IDX]]
319-
; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX1:%.*]] = mul i64 [[INDEX]], 2
320-
; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX1]]
319+
; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX]]
321320
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i16>, ptr [[NEXT_GEP]], align 2
322321
; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = zext <vscale x 2 x i16> [[WIDE_LOAD]] to <vscale x 2 x i64>
323322
; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i16>, ptr [[NEXT_GEP2]], align 2
@@ -357,8 +356,7 @@ define i64 @not_dotp_i16_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %
357356
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ]
358357
; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
359358
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[OFFSET_IDX]]
360-
; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX2:%.*]] = mul i64 [[INDEX]], 2
361-
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX2]]
359+
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX]]
362360
; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
363361
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 2
364362
; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 [[TMP14]]
@@ -410,8 +408,7 @@ define i64 @not_dotp_i16_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %
410408
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
411409
; CHECK-MAXBW-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
412410
; CHECK-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[OFFSET_IDX]]
413-
; CHECK-MAXBW-NEXT: [[OFFSET_IDX1:%.*]] = mul i64 [[INDEX]], 2
414-
; CHECK-MAXBW-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX1]]
411+
; CHECK-MAXBW-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX]]
415412
; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[NEXT_GEP]], align 2
416413
; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i64>
417414
; CHECK-MAXBW-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i16>, ptr [[NEXT_GEP2]], align 2
@@ -1852,6 +1849,17 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
18521849
; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i64>
18531850
; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[INDEX]], 1
18541851
; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP10]]
1852+
; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
1853+
; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i64>
1854+
; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = mul nuw nsw <vscale x 8 x i64> [[TMP12]], [[TMP9]]
1855+
; CHECK-MAXBW-NEXT: [[TMP14]] = add <vscale x 8 x i64> [[VEC_PHI]], [[TMP15]]
1856+
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
1857+
; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1858+
; CHECK-MAXBW-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
1859+
; CHECK-MAXBW: middle.block:
1860+
; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> [[TMP14]])
1861+
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]]
1862+
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
18551863
; CHECK-MAXBW: scalar.ph:
18561864
;
18571865
entry:

llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll

Lines changed: 4 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -564,11 +564,8 @@ define void @trip_count_vscale(ptr noalias %a, ptr noalias %b) vscale_range(1, 1
564564
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x float>, ptr [[TMP17]], align 4
565565
; CHECK-NEXT: [[TMP18:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD3]]
566566
; CHECK-NEXT: [[TMP19:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD4]]
567-
; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
568-
; CHECK-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 4
569-
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[TMP21]]
570567
; CHECK-NEXT: store <vscale x 4 x float> [[TMP18]], ptr [[TMP13]], align 4
571-
; CHECK-NEXT: store <vscale x 4 x float> [[TMP19]], ptr [[TMP22]], align 4
568+
; CHECK-NEXT: store <vscale x 4 x float> [[TMP19]], ptr [[TMP17]], align 4
572569
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
573570
; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
574571
; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
@@ -640,11 +637,8 @@ define void @trip_count_vscale(ptr noalias %a, ptr noalias %b) vscale_range(1, 1
640637
; CHECK-VF8-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP15]], align 4
641638
; CHECK-VF8-NEXT: [[TMP16:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
642639
; CHECK-VF8-NEXT: [[TMP17:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD3]]
643-
; CHECK-VF8-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
644-
; CHECK-VF8-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 4
645-
; CHECK-VF8-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP19]]
646640
; CHECK-VF8-NEXT: store <vscale x 4 x float> [[TMP16]], ptr [[TMP11]], align 4
647-
; CHECK-VF8-NEXT: store <vscale x 4 x float> [[TMP17]], ptr [[TMP20]], align 4
641+
; CHECK-VF8-NEXT: store <vscale x 4 x float> [[TMP17]], ptr [[TMP15]], align 4
648642
; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
649643
; CHECK-VF8-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
650644
; CHECK-VF8-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
@@ -716,11 +710,8 @@ define void @trip_count_vscale_no_epilogue_iterations(ptr noalias %a, ptr noalia
716710
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x float>, ptr [[TMP17]], align 4
717711
; CHECK-NEXT: [[TMP18:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD3]]
718712
; CHECK-NEXT: [[TMP19:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD4]]
719-
; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
720-
; CHECK-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 4
721-
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[TMP21]]
722713
; CHECK-NEXT: store <vscale x 4 x float> [[TMP18]], ptr [[TMP13]], align 4
723-
; CHECK-NEXT: store <vscale x 4 x float> [[TMP19]], ptr [[TMP22]], align 4
714+
; CHECK-NEXT: store <vscale x 4 x float> [[TMP19]], ptr [[TMP17]], align 4
724715
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
725716
; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
726717
; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
@@ -792,11 +783,8 @@ define void @trip_count_vscale_no_epilogue_iterations(ptr noalias %a, ptr noalia
792783
; CHECK-VF8-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP15]], align 4
793784
; CHECK-VF8-NEXT: [[TMP16:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
794785
; CHECK-VF8-NEXT: [[TMP17:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD3]]
795-
; CHECK-VF8-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
796-
; CHECK-VF8-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 4
797-
; CHECK-VF8-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP19]]
798786
; CHECK-VF8-NEXT: store <vscale x 4 x float> [[TMP16]], ptr [[TMP11]], align 4
799-
; CHECK-VF8-NEXT: store <vscale x 4 x float> [[TMP17]], ptr [[TMP20]], align 4
787+
; CHECK-VF8-NEXT: store <vscale x 4 x float> [[TMP17]], ptr [[TMP15]], align 4
800788
; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
801789
; CHECK-VF8-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
802790
; CHECK-VF8-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]

0 commit comments

Comments
 (0)