@@ -180,6 +180,8 @@ const char LLVMLoopVectorizeFollowupEpilogue[] =
180
180
STATISTIC (LoopsVectorized, " Number of loops vectorized" );
181
181
STATISTIC (LoopsAnalyzed, " Number of loops analyzed for vectorization" );
182
182
STATISTIC (LoopsEpilogueVectorized, " Number of epilogues vectorized" );
183
+ STATISTIC (CSAsVectorized,
184
+ " Number of conditional scalar assignments vectorized" );
183
185
184
186
static cl::opt<bool > EnableEpilogueVectorization (
185
187
" enable-epilogue-vectorization" , cl::init(true ), cl::Hidden,
@@ -500,6 +502,10 @@ class InnerLoopVectorizer {
500
502
virtual std::pair<BasicBlock *, Value *>
501
503
createVectorizedLoopSkeleton (const SCEV2ValueTy &ExpandedSCEVs);
502
504
505
+ // / For all vectorized CSAs, replace uses of the live-out scalar from the original
506
+ // / loop with the scalar extracted from the vector loop.
507
+ void fixCSALiveOuts (VPTransformState &State, VPlan &Plan);
508
+
503
509
// / Fix the vectorized code, taking care of header phi's, live-outs, and more.
504
510
void fixVectorizedLoop (VPTransformState &State, VPlan &Plan);
505
511
@@ -2932,6 +2938,25 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
2932
2938
TargetTransformInfo::TCK_RecipThroughput);
2933
2939
}
2934
2940
2941
+ void InnerLoopVectorizer::fixCSALiveOuts (VPTransformState &State, VPlan &Plan) {
2942
+ for (const auto &CSA : Plan.getCSAStates ()) {
2943
+ VPCSADataUpdateRecipe *VPDataUpdate = CSA.second ->getDataUpdate ();
2944
+ assert (VPDataUpdate &&
2945
+ " VPDataUpdate must have been introduced prior to fixing live outs" );
2946
+ Value *V = VPDataUpdate->getUnderlyingValue ();
2947
+ Value *ExtractedScalar = State.get (CSA.second ->getExtractScalarRecipe (), 0 ,
2948
+ /* NeedsScalar=*/ true );
2949
+ // Fix LCSSAPhis
2950
+ llvm::SmallPtrSet<PHINode *, 2 > ToFix;
2951
+ for (User *U : V->users ())
2952
+ if (auto *Phi = dyn_cast<PHINode>(U);
2953
+ Phi && Phi->getParent () == LoopExitBlock)
2954
+ ToFix.insert (Phi);
2955
+ for (PHINode *Phi : ToFix)
2956
+ Phi->addIncoming (ExtractedScalar, LoopMiddleBlock);
2957
+ }
2958
+ }
2959
+
2935
2960
void InnerLoopVectorizer::fixVectorizedLoop (VPTransformState &State,
2936
2961
VPlan &Plan) {
2937
2962
// Fix widened non-induction PHIs by setting up the PHI operands.
@@ -2972,6 +2997,8 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
2972
2997
getOrCreateVectorTripCount (VectorLoop->getLoopPreheader ()),
2973
2998
IVEndValues[Entry.first ], LoopMiddleBlock,
2974
2999
VectorLoop->getHeader (), Plan, State);
3000
+
3001
+ fixCSALiveOuts (State, Plan);
2975
3002
}
2976
3003
2977
3004
// Fix live-out phis not already fixed earlier.
@@ -4482,6 +4509,9 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
4482
4509
case VPDef::VPEVLBasedIVPHISC:
4483
4510
case VPDef::VPPredInstPHISC:
4484
4511
case VPDef::VPBranchOnMaskSC:
4512
+ case VPRecipeBase::VPCSADataUpdateSC:
4513
+ case VPRecipeBase::VPCSAExtractScalarSC:
4514
+ case VPRecipeBase::VPCSAHeaderPHISC:
4485
4515
continue ;
4486
4516
case VPDef::VPReductionSC:
4487
4517
case VPDef::VPActiveLaneMaskPHISC:
@@ -8508,9 +8538,6 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8508
8538
return Recipe;
8509
8539
8510
8540
VPHeaderPHIRecipe *PhiRecipe = nullptr ;
8511
- assert ((Legal->isReductionVariable (Phi) ||
8512
- Legal->isFixedOrderRecurrence (Phi)) &&
8513
- " can only widen reductions and fixed-order recurrences here" );
8514
8541
VPValue *StartV = Operands[0 ];
8515
8542
if (Legal->isReductionVariable (Phi)) {
8516
8543
const RecurrenceDescriptor &RdxDesc =
@@ -8520,12 +8547,23 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8520
8547
PhiRecipe = new VPReductionPHIRecipe (Phi, RdxDesc, *StartV,
8521
8548
CM.isInLoopReduction (Phi),
8522
8549
CM.useOrderedReductions (RdxDesc));
8523
- } else {
8550
+ } else if (Legal-> isFixedOrderRecurrence (Phi)) {
8524
8551
// TODO: Currently fixed-order recurrences are modeled as chains of
8525
8552
// first-order recurrences. If there are no users of the intermediate
8526
8553
// recurrences in the chain, the fixed order recurrence should be modeled
8527
8554
// directly, enabling more efficient codegen.
8528
8555
PhiRecipe = new VPFirstOrderRecurrencePHIRecipe (Phi, *StartV);
8556
+ } else if (Legal->isCSAPhi (Phi)) {
8557
+ VPCSAState *State = Plan.getCSAStates ().find (Phi)->second ;
8558
+ VPValue *InitData = State->getVPInitData ();
8559
+ // When the VF=getFixed(1), InitData is just InitScalar.
8560
+ if (!InitData)
8561
+ InitData = State->getVPInitScalar ();
8562
+ PhiRecipe = new VPCSAHeaderPHIRecipe (Phi, InitData);
8563
+ State->setPhiRecipe (cast<VPCSAHeaderPHIRecipe>(PhiRecipe));
8564
+ } else {
8565
+ llvm_unreachable (
8566
+ " can only widen reductions, fixed-order recurrences, and CSAs here" );
8529
8567
}
8530
8568
8531
8569
PhisToFix.push_back (PhiRecipe);
@@ -8555,6 +8593,19 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8555
8593
make_range (Operands.begin (), Operands.end ()));
8556
8594
8557
8595
if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8596
+ auto *CSADescIt = find_if (Legal->getCSAs (), [&](auto CSA) {
8597
+ return CSADescriptor::isCSASelect (CSA.second , SI);
8598
+ });
8599
+ if (CSADescIt != Legal->getCSAs ().end ()) {
8600
+ PHINode *CSAPhi = CSADescIt->first ;
8601
+ VPCSAState *State = Plan.getCSAStates ().find (CSAPhi)->second ;
8602
+ VPValue *VPDataPhi = State->getPhiRecipe ();
8603
+ auto *R = new VPCSADataUpdateRecipe (
8604
+ SI, {VPDataPhi, Operands[0 ], Operands[1 ], Operands[2 ]});
8605
+ State->setDataUpdate (R);
8606
+ return R;
8607
+ }
8608
+
8558
8609
return new VPWidenSelectRecipe (
8559
8610
*SI, make_range (Operands.begin (), Operands.end ()));
8560
8611
}
@@ -8567,6 +8618,107 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8567
8618
return tryToWiden (Instr, Operands, VPBB);
8568
8619
}
8569
8620
8621
+ // / Add CSA Recipes that can occur before each instruction in the input IR
8622
+ // / is processed and introduced into VPlan.
8623
+ static void
8624
+ addCSAPreprocessRecipes (const LoopVectorizationLegality::CSAList &CSAs,
8625
+ Loop *OrigLoop, VPBasicBlock *PreheaderVPBB,
8626
+ VPBasicBlock *HeaderVPBB, DebugLoc DL, VFRange &Range,
8627
+ VPlan &Plan) {
8628
+
8629
+ // Don't build full CSA for VF=ElementCount::getFixed(1)
8630
+ bool IsScalarVF = LoopVectorizationPlanner::getDecisionAndClampRange (
8631
+ [&](ElementCount VF) { return VF.isScalar (); }, Range);
8632
+
8633
+ for (const auto &CSA : CSAs) {
8634
+ VPValue *VPInitScalar = Plan.getOrAddLiveIn (
8635
+ CSA.first ->getIncomingValueForBlock (OrigLoop->getLoopPreheader ()));
8636
+
8637
+ // Scalar VF builds the scalar version of the loop. In that case,
8638
+ // no maintenence of mask nor extraction in middle block is needed.
8639
+ if (IsScalarVF) {
8640
+ VPCSAState *S = new VPCSAState (VPInitScalar);
8641
+ Plan.addCSAState (CSA.first , S);
8642
+ continue ;
8643
+ }
8644
+
8645
+ auto *VPInitMask =
8646
+ new VPInstruction (VPInstruction::CSAInitMask, {}, DL, " csa.init.mask" );
8647
+ auto *VPInitData = new VPInstruction (VPInstruction::CSAInitData,
8648
+ {VPInitScalar}, DL, " csa.init.data" );
8649
+ PreheaderVPBB->appendRecipe (VPInitMask);
8650
+ PreheaderVPBB->appendRecipe (VPInitData);
8651
+
8652
+ auto *VPMaskPhi = new VPInstruction (VPInstruction::CSAMaskPhi, {VPInitMask},
8653
+ DL, " csa.mask.phi" );
8654
+ HeaderVPBB->appendRecipe (VPMaskPhi);
8655
+
8656
+ auto *S = new VPCSAState (VPInitScalar, VPInitData, VPMaskPhi);
8657
+ Plan.addCSAState (CSA.first , S);
8658
+ }
8659
+ }
8660
+
8661
+ // / Add CSA Recipes that must occur after each instruction in the input IR
8662
+ // / is processed and introduced into VPlan.
8663
+ static void
8664
+ addCSAPostprocessRecipes (VPRecipeBuilder &RecipeBuilder,
8665
+ const LoopVectorizationLegality::CSAList &CSAs,
8666
+ VPBasicBlock *MiddleVPBB, DebugLoc DL, VFRange &Range,
8667
+ VPlan &Plan) {
8668
+ // Don't build CSA for VF=ElementCount::getFixed(1)
8669
+ if (LoopVectorizationPlanner::getDecisionAndClampRange (
8670
+ [&](ElementCount VF) { return VF.isScalar (); }, Range))
8671
+ return ;
8672
+
8673
+ for (const auto &CSA : CSAs) {
8674
+ VPCSAState *CSAState = Plan.getCSAStates ().find (CSA.first )->second ;
8675
+ VPCSADataUpdateRecipe *VPDataUpdate = CSAState->getDataUpdate ();
8676
+
8677
+ assert (VPDataUpdate &&
8678
+ " VPDataUpdate must have been introduced prior to postprocess" );
8679
+ assert (CSA.second .getCond () &&
8680
+ " CSADescriptor must know how to describe the condition" );
8681
+ auto GetVPValue = [&](Value *I) {
8682
+ return RecipeBuilder.getRecipe (cast<Instruction>(I))->getVPSingleValue ();
8683
+ };
8684
+ VPValue *WidenedCond = GetVPValue (CSA.second .getCond ());
8685
+ VPValue *VPInitScalar = CSAState->getVPInitScalar ();
8686
+
8687
+ // The CSA optimization wants to use a condition such that when it is
8688
+ // true, a new value is assigned. However, it is possible that a true lane
8689
+ // in WidenedCond corresponds to selection of the initial value instead.
8690
+ // In that case, we must use the negation of WidenedCond.
8691
+ // i.e. select cond new_val old_val versus select cond.not old_val new_val
8692
+ VPValue *CondToUse = WidenedCond;
8693
+ if (cast<SelectInst>(CSA.second .getAssignment ())->getTrueValue () ==
8694
+ CSA.first ) {
8695
+ auto *VPNotCond = new VPInstruction (VPInstruction::Not, WidenedCond, DL);
8696
+ VPNotCond->insertBefore (
8697
+ GetVPValue (CSA.second .getAssignment ())->getDefiningRecipe ());
8698
+ CondToUse = VPNotCond;
8699
+ }
8700
+
8701
+ auto *VPAnyActive = new VPInstruction (
8702
+ VPInstruction::CSAAnyActive, {CondToUse}, DL, " csa.cond.anyactive" );
8703
+ VPAnyActive->insertBefore (
8704
+ GetVPValue (CSA.second .getAssignment ())->getDefiningRecipe ());
8705
+
8706
+ auto *VPMaskSel = new VPInstruction (
8707
+ VPInstruction::CSAMaskSel,
8708
+ {CondToUse, CSAState->getVPMaskPhi (), VPAnyActive}, DL, " csa.mask.sel" );
8709
+ VPMaskSel->insertAfter (VPAnyActive);
8710
+ VPDataUpdate->setVPNewMaskAndVPAnyActive (VPMaskSel, VPAnyActive);
8711
+ VPCSAExtractScalarRecipe *ExtractScalarRecipe =
8712
+ new VPCSAExtractScalarRecipe ({VPInitScalar, VPMaskSel, VPDataUpdate});
8713
+
8714
+ MiddleVPBB->insert (ExtractScalarRecipe, MiddleVPBB->getFirstNonPhi ());
8715
+
8716
+ // Update CSAState with new recipes
8717
+ CSAState->setExtractScalarRecipe (ExtractScalarRecipe);
8718
+ CSAState->setVPAnyActive (VPAnyActive);
8719
+ }
8720
+ }
8721
+
8570
8722
void LoopVectorizationPlanner::buildVPlansWithVPRecipes (ElementCount MinVF,
8571
8723
ElementCount MaxVF) {
8572
8724
assert (OrigLoop->isInnermost () && " Inner loop expected." );
@@ -8623,7 +8775,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
8623
8775
// VPWidenPointerInductionRecipe and induction increments.
8624
8776
static MapVector<PHINode *, VPValue *> collectUsersInExitBlock (
8625
8777
Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
8626
- const MapVector<PHINode *, InductionDescriptor> &Inductions) {
8778
+ const MapVector<PHINode *, InductionDescriptor> &Inductions,
8779
+ const MapVector<PHINode *, CSADescriptor> &CSAs) {
8627
8780
auto MiddleVPBB =
8628
8781
cast<VPBasicBlock>(Plan.getVectorLoopRegion ()->getSingleSuccessor ());
8629
8782
// No edge from the middle block to the unique exit block has been inserted
@@ -8652,6 +8805,17 @@ static MapVector<PHINode *, VPValue *> collectUsersInExitBlock(
8652
8805
return P && Inductions.contains (P);
8653
8806
})))
8654
8807
continue ;
8808
+ // Exit values for CSAs are computed and updated outside of VPlan and
8809
+ // independent of induction recipes.
8810
+ // TODO: Compute induction exit values in VPlan, use VPLiveOuts to update
8811
+ // live-outs.
8812
+ if (isa<VPCSADataUpdateRecipe>(V) &&
8813
+ (isa<Instruction>(IncomingValue) &&
8814
+ any_of (IncomingValue->users (), [&CSAs](User *U) {
8815
+ auto *P = dyn_cast<PHINode>(U);
8816
+ return P && CSAs.contains (P);
8817
+ })))
8818
+ continue ;
8655
8819
ExitingValuesToFix.insert ({&ExitPhi, V});
8656
8820
}
8657
8821
return ExitingValuesToFix;
@@ -8893,6 +9057,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8893
9057
bool HasNUW = Style == TailFoldingStyle::None;
8894
9058
addCanonicalIVRecipes (*Plan, Legal->getWidestInductionType (), HasNUW, DL);
8895
9059
9060
+ addCSAPreprocessRecipes (Legal->getCSAs (), OrigLoop, Plan->getPreheader (),
9061
+ Plan->getVectorLoopRegion ()->getEntryBasicBlock (), DL,
9062
+ Range, *Plan);
9063
+
8896
9064
VPRecipeBuilder RecipeBuilder (*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
8897
9065
8898
9066
// ---------------------------------------------------------------------------
@@ -8999,6 +9167,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8999
9167
VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor ());
9000
9168
}
9001
9169
9170
+ VPBasicBlock *MiddleVPBB =
9171
+ cast<VPBasicBlock>(Plan->getVectorLoopRegion ()->getSingleSuccessor ());
9172
+ addCSAPostprocessRecipes (RecipeBuilder, Legal->getCSAs (), MiddleVPBB, DL,
9173
+ Range, *Plan);
9174
+
9002
9175
// After here, VPBB should not be used.
9003
9176
VPBB = nullptr ;
9004
9177
@@ -9008,8 +9181,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
9008
9181
" VPBasicBlock" );
9009
9182
RecipeBuilder.fixHeaderPhis ();
9010
9183
9011
- MapVector<PHINode *, VPValue *> ExitingValuesToFix = collectUsersInExitBlock (
9012
- OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars ());
9184
+ MapVector<PHINode *, VPValue *> ExitingValuesToFix =
9185
+ collectUsersInExitBlock (OrigLoop, RecipeBuilder, *Plan,
9186
+ Legal->getInductionVars (), Legal->getCSAs ());
9013
9187
9014
9188
addLiveOutsForFirstOrderRecurrences (*Plan, ExitingValuesToFix);
9015
9189
addUsersInExitBlock (*Plan, ExitingValuesToFix);
@@ -10106,6 +10280,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10106
10280
const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan (
10107
10281
EPI.MainLoopVF , EPI.MainLoopUF , *BestMainPlan, MainILV, DT, true );
10108
10282
++LoopsVectorized;
10283
+ CSAsVectorized += LVL.getCSAs ().size ();
10109
10284
10110
10285
// Second pass vectorizes the epilogue and adjusts the control flow
10111
10286
// edges from the first pass.
@@ -10198,6 +10373,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10198
10373
PSI, Checks);
10199
10374
LVP.executePlan (VF.Width , IC, BestPlan, LB, DT, false );
10200
10375
++LoopsVectorized;
10376
+ CSAsVectorized += LVL.getCSAs ().size ();
10201
10377
10202
10378
// Add metadata to disable runtime unrolling a scalar loop when there
10203
10379
// are no runtime checks about strides and memory. A scalar loop that is
0 commit comments