llvm
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
Lines changed: 17 additions & 8 deletions
diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
Lines changed: 52 additions & 93 deletions
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
Lines changed: 17 additions & 3 deletions
diff --git a/llvm/test/Transforms/LoopUnroll/peel-branch-weights-simple.ll b/llvm/test/Transforms/LoopUnroll/peel-branch-weights-simple.ll
Lines changed: 66 additions & 0 deletions
@@ -315,7 +315,8 @@ TransformationMode hasLICMVersioningTransformation(const Loop *L);
 void addStringMetadataToLoop(Loop *TheLoop, const char *MDString,
                              unsigned V = 0);
 
-/// Returns a loop's estimated trip count based on branch weight metadata.
+/// Returns a loop's estimated trip count based on
+/// llvm.loop.estimated_trip_count metadata or, if none, branch weight metadata.
 /// In addition if \p EstimatedLoopInvocationWeight is not null it is
 /// initialized with weight of loop's latch leading to the exit.
 /// Returns a valid positive trip count, saturated at UINT_MAX, or std::nullopt
@@ -324,13 +325,21 @@ std::optional<unsigned>
 getLoopEstimatedTripCount(Loop *L,
                           unsigned *EstimatedLoopInvocationWeight = nullptr);
 
-/// Set a loop's branch weight metadata to reflect that loop has \p
-/// EstimatedTripCount iterations and \p EstimatedLoopInvocationWeight exits
-/// through latch. Returns true if metadata is successfully updated, false
-/// otherwise. Note that loop must have a latch block which controls loop exit
-/// in order to succeed.
-bool setLoopEstimatedTripCount(Loop *L, unsigned EstimatedTripCount,
-                               unsigned EstimatedLoopInvocationWeight);
+/// Set a loop's llvm.loop.estimated_trip_count metadata and, if \p
+/// EstimatedLoopInvocationWeight has a value, its branch weight metadata to
+/// reflect that the loop has \p EstimatedTripCount iterations and \p
+/// EstimatedLoopInvocationWeight exit weight through the latch. Returns true if
+/// metadata is successfully updated, false otherwise. Note that the loop must
+/// have a latch block which controls loop exit in order to succeed.
+///
+/// Pass std::nullopt for \p EstimatedLoopInvocationWeight when the original
+/// branch weight metadata is correct for computing block frequencies but the
+/// trip count has changed due to a loop transformation.  The branch weight
+/// metadata cannot be adjusted to reflect the new trip count without skewing
+/// those frequencies, so the new trip count is stored separately.
+bool setLoopEstimatedTripCount(
+    Loop *L, unsigned EstimatedTripCount,
+    std::optional<unsigned> EstimatedLoopInvocationWeight);
 
 /// Check inner loop (L) backedge count is known to be invariant on all
 /// iterations of its outer loop. If the loop has no parent, this is trivially
 
@@ -655,84 +655,6 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
   }
 }
 
-struct WeightInfo {
-  // Weights for current iteration.
-  SmallVector<uint32_t> Weights;
-  // Weights to subtract after each iteration.
-  const SmallVector<uint32_t> SubWeights;
-};
-
-/// Update the branch weights of an exiting block of a peeled-off loop
-/// iteration.
-/// Let F is a weight of the edge to continue (fallthrough) into the loop.
-/// Let E is a weight of the edge to an exit.
-/// F/(F+E) is a probability to go to loop and E/(F+E) is a probability to
-/// go to exit.
-/// Then, Estimated ExitCount = F / E.
-/// For I-th (counting from 0) peeled off iteration we set the weights for
-/// the peeled exit as (EC - I, 1). It gives us reasonable distribution,
-/// The probability to go to exit 1/(EC-I) increases. At the same time
-/// the estimated exit count in the remainder loop reduces by I.
-/// To avoid dealing with division rounding we can just multiple both part
-/// of weights to E and use weight as (F - I * E, E).
-static void updateBranchWeights(Instruction *Term, WeightInfo &Info) {
-  setBranchWeights(*Term, Info.Weights, /*IsExpected=*/false);
-  for (auto [Idx, SubWeight] : enumerate(Info.SubWeights))
-    if (SubWeight != 0)
-      // Don't set the probability of taking the edge from latch to loop header
-      // to less than 1:1 ratio (meaning Weight should not be lower than
-      // SubWeight), as this could significantly reduce the loop's hotness,
-      // which would be incorrect in the case of underestimating the trip count.
-      Info.Weights[Idx] =
-          Info.Weights[Idx] > SubWeight
-              ? std::max(Info.Weights[Idx] - SubWeight, SubWeight)
-              : SubWeight;
-}
-
-/// Initialize the weights for all exiting blocks.
-static void initBranchWeights(DenseMap<Instruction *, WeightInfo> &WeightInfos,
-                              Loop *L) {
-  SmallVector<BasicBlock *> ExitingBlocks;
-  L->getExitingBlocks(ExitingBlocks);
-  for (BasicBlock *ExitingBlock : ExitingBlocks) {
-    Instruction *Term = ExitingBlock->getTerminator();
-    SmallVector<uint32_t> Weights;
-    if (!extractBranchWeights(*Term, Weights))
-      continue;
-
-    // See the comment on updateBranchWeights() for an explanation of what we
-    // do here.
-    uint32_t FallThroughWeights = 0;
-    uint32_t ExitWeights = 0;
-    for (auto [Succ, Weight] : zip(successors(Term), Weights)) {
-      if (L->contains(Succ))
-        FallThroughWeights += Weight;
-      else
-        ExitWeights += Weight;
-    }
-
-    // Don't try to update weights for degenerate case.
-    if (FallThroughWeights == 0)
-      continue;
-
-    SmallVector<uint32_t> SubWeights;
-    for (auto [Succ, Weight] : zip(successors(Term), Weights)) {
-      if (!L->contains(Succ)) {
-        // Exit weights stay the same.
-        SubWeights.push_back(0);
-        continue;
-      }
-
-      // Subtract exit weights on each iteration, distributed across all
-      // fallthrough edges.
-      double W = (double)Weight / (double)FallThroughWeights;
-      SubWeights.push_back((uint32_t)(ExitWeights * W));
-    }
-
-    WeightInfos.insert({Term, {std::move(Weights), std::move(SubWeights)}});
-  }
-}
-
 /// Clones the body of the loop L, putting it between \p InsertTop and \p
 /// InsertBot.
 /// \param IterNumber The serial number of the iteration currently being
@@ -1006,11 +928,6 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
   Instruction *LatchTerm =
       cast<Instruction>(cast<BasicBlock>(Latch)->getTerminator());
 
-  // If we have branch weight information, we'll want to update it for the
-  // newly created branches.
-  DenseMap<Instruction *, WeightInfo> Weights;
-  initBranchWeights(Weights, L);
-
   // Identify what noalias metadata is inside the loop: if it is inside the
   // loop, the associated metadata must be cloned for each iteration.
   SmallVector<MDNode *, 6> LoopLocalNoAliasDeclScopes;
@@ -1038,11 +955,6 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
     assert(DT.verify(DominatorTree::VerificationLevel::Fast));
 #endif
 
-    for (auto &[Term, Info] : Weights) {
-      auto *TermCopy = cast<Instruction>(VMap[Term]);
-      updateBranchWeights(TermCopy, Info);
-    }
-
     // Remove Loop metadata from the latch branch instruction
     // because it is not the Loop's latch branch anymore.
     auto *LatchTermCopy = cast<Instruction>(VMap[LatchTerm]);
@@ -1068,15 +980,62 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
     PHI->setIncomingValueForBlock(NewPreHeader, NewVal);
   }
 
-  for (const auto &[Term, Info] : Weights) {
-    setBranchWeights(*Term, Info.Weights, /*IsExpected=*/false);
-  }
-
   // Update Metadata for count of peeled off iterations.
   unsigned AlreadyPeeled = 0;
   if (auto Peeled = getOptionalIntLoopAttribute(L, PeeledCountMetaData))
     AlreadyPeeled = *Peeled;
-  addStringMetadataToLoop(L, PeeledCountMetaData, AlreadyPeeled + PeelCount);
+  unsigned TotalPeeled = AlreadyPeeled + PeelCount;
+  addStringMetadataToLoop(L, PeeledCountMetaData, TotalPeeled);
+
+  // Update metadata for the estimated trip count.  The original branch weight
+  // metadata is already correct for both the remaining loop and the peeled loop
+  // iterations, so don't adjust it.
+  //
+  // For example, consider what happens when peeling 2 iterations from a loop
+  // with an estimated trip count of 10 and inserting them before the remaining
+  // loop.  Each of the peeled iterations and each iteration in the remaining
+  // loop still has the same probability of exiting the *entire original* loop
+  // as it did when in the original loop, and thus it should still have the same
+  // branch weights.  The peeled iterations' non-zero probabilities of exiting
+  // already appropriately reduce the probability of reaching the remaining
+  // iterations just as they did in the original loop.  Trying to also adjust
+  // the remaining loop's branch weights to reflect its new trip count of 8 will
+  // erroneously further reduce its block frequencies.  However, in case an
+  // analysis later needs to determine the trip count of the remaining loop
+  // while examining it in isolation without considering the probability of
+  // actually reaching it, we store the new trip count as separate metadata.
+  //
+  // FIXME: getLoopEstimatedTripCount and setLoopEstimatedTripCount skip loops
+  // that don't match the restrictions of getExpectedExitLoopLatchBranch in
+  // LoopUtils.cpp.  For example,
+  // llvm/test/Transforms/LoopUnroll/peel-branch-weights.ll (introduced by
+  // b43a4d0850d5) has multiple exits.  Should we try to extend them to handle
+  // such cases?  For now, we just don't try to record
+  // llvm.loop.estimated_trip_count for such cases, so the original branch
+  // weights will have to do.
+  if (auto EstimatedTripCount = getLoopEstimatedTripCount(L)) {
+    // FIXME: The previous updateBranchWeights implementation had this
+    // comment:
+    //
+    //   Don't set the probability of taking the edge from latch to loop header
+    //   to less than 1:1 ratio (meaning Weight should not be lower than
+    //   SubWeight), as this could significantly reduce the loop's hotness,
+    //   which would be incorrect in the case of underestimating the trip count.
+    //
+    // See e8d5db206c2f commit log for further discussion.  That seems to
+    // suggest that we should avoid ever setting a trip count of < 2 here
+    // (equal chance of continuing and exiting means the loop will likely
+    // continue once and then exit once).  Or is keeping the original branch
+    // weights already a sufficient improvement for whatever analysis cares
+    // about this case?
+    unsigned EstimatedTripCountNew = *EstimatedTripCount;
+    if (EstimatedTripCountNew < TotalPeeled) // FIXME: TotalPeeled + 2?
+      EstimatedTripCountNew = 0;             // FIXME: = 2?
+    else
+      EstimatedTripCountNew -= TotalPeeled;
+    setLoopEstimatedTripCount(L, EstimatedTripCountNew,
+                              /*EstimatedLoopInvocationWeight=*/std::nullopt);
+  }
 
   if (Loop *ParentLoop = L->getParentLoop())
     L = ParentLoop;
 
@@ -53,6 +53,8 @@ using namespace llvm::PatternMatch;
 
 static const char *LLVMLoopDisableNonforced = "llvm.loop.disable_nonforced";
 static const char *LLVMLoopDisableLICM = "llvm.licm.disable";
+static const char *LLVMLoopEstimatedTripCount =
+    "llvm.loop.estimated_trip_count";
 
 bool llvm::formDedicatedExitBlocks(Loop *L, DominatorTree *DT, LoopInfo *LI,
                                    MemorySSAUpdater *MSSAU,
@@ -864,27 +866,39 @@ llvm::getLoopEstimatedTripCount(Loop *L,
             getEstimatedTripCount(LatchBranch, L, ExitWeight)) {
       if (EstimatedLoopInvocationWeight)
         *EstimatedLoopInvocationWeight = ExitWeight;
+      // FIXME: Where else are branch weights directly used for estimating loop
+      // trip counts?  They should also be updated to use
+      // LLVMLoopEstimatedTripCount when present... or to just call this
+      // function.
+      if (auto EstimatedTripCount =
+              getOptionalIntLoopAttribute(L, LLVMLoopEstimatedTripCount))
+        return EstimatedTripCount;
       return *EstTripCount;
     }
   }
   return std::nullopt;
 }
 
-bool llvm::setLoopEstimatedTripCount(Loop *L, unsigned EstimatedTripCount,
-                                     unsigned EstimatedloopInvocationWeight) {
+bool llvm::setLoopEstimatedTripCount(
+    Loop *L, unsigned EstimatedTripCount,
+    std::optional<unsigned> EstimatedloopInvocationWeight) {
   // At the moment, we currently support changing the estimate trip count of
   // the latch branch only.  We could extend this API to manipulate estimated
   // trip counts for any exit.
   BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
   if (!LatchBranch)
     return false;
 
+  addStringMetadataToLoop(L, LLVMLoopEstimatedTripCount, EstimatedTripCount);
+  if (!EstimatedloopInvocationWeight)
+    return true;
+
   // Calculate taken and exit weights.
   unsigned LatchExitWeight = 0;
   unsigned BackedgeTakenWeight = 0;
 
   if (EstimatedTripCount > 0) {
-    LatchExitWeight = EstimatedloopInvocationWeight;
+    LatchExitWeight = *EstimatedloopInvocationWeight;
     BackedgeTakenWeight = (EstimatedTripCount - 1) * LatchExitWeight;
   }
 
 
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
+; RUN: opt < %s -S -passes=loop-unroll -unroll-force-peel-count=2 2>&1 | FileCheck %s
+
+declare void @f(i32)
+
+; Test branch weights and estimated trip count metadata for simple loop after
+; peeling.
+define void @test(i32 %n) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[DO_BODY_PEEL_BEGIN:%.*]]
+; CHECK:       do.body.peel.begin:
+; CHECK-NEXT:    br label [[DO_BODY_PEEL:%.*]]
+; CHECK:       do.body.peel:
+; CHECK-NEXT:    [[INC_PEEL:%.*]] = add i32 0, 1
+; CHECK-NEXT:    call void @f(i32 0)
+; CHECK-NEXT:    [[C_PEEL:%.*]] = icmp sge i32 [[INC_PEEL]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[C_PEEL]], label [[DO_END:%.*]], label [[DO_BODY_PEEL_NEXT:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK:       do.body.peel.next:
+; CHECK-NEXT:    br label [[DO_BODY_PEEL2:%.*]]
+; CHECK:       do.body.peel2:
+; CHECK-NEXT:    [[INC_PEEL3:%.*]] = add i32 [[INC_PEEL]], 1
+; CHECK-NEXT:    call void @f(i32 [[INC_PEEL]])
+; CHECK-NEXT:    [[C_PEEL4:%.*]] = icmp sge i32 [[INC_PEEL3]], [[N]]
+; CHECK-NEXT:    br i1 [[C_PEEL4]], label [[DO_END]], label [[DO_BODY_PEEL_NEXT1:%.*]], !prof [[PROF0]]
+; CHECK:       do.body.peel.next1:
+; CHECK-NEXT:    br label [[DO_BODY_PEEL_NEXT5:%.*]]
+; CHECK:       do.body.peel.next5:
+; CHECK-NEXT:    br label [[ENTRY_PEEL_NEWPH:%.*]]
+; CHECK:       entry.peel.newph:
+; CHECK-NEXT:    br label [[DO_BODY:%.*]]
+; CHECK:       do.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ [[INC_PEEL3]], [[ENTRY_PEEL_NEWPH]] ], [ [[INC:%.*]], [[DO_BODY]] ]
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I]], 1
+; CHECK-NEXT:    call void @f(i32 [[I]])
+; CHECK-NEXT:    [[C:%.*]] = icmp sge i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[C]], label [[DO_END_LOOPEXIT:%.*]], label [[DO_BODY]], !prof [[PROF0]], !llvm.loop [[LOOP1:![0-9]+]]
+; CHECK:       do.end.loopexit:
+; CHECK-NEXT:    br label [[DO_END]]
+; CHECK:       do.end:
+; CHECK-NEXT:    ret void
+;
+
+entry:
+  br label %do.body
+
+do.body:
+  %i = phi i32 [ 0, %entry ], [ %inc, %do.body ]
+  %inc = add i32 %i, 1
+  call void @f(i32 %i)
+  %c = icmp sge i32 %inc, %n
+  br i1 %c, label %do.end, label %do.body, !prof !0
+
+do.end:
+  ret void
+}
+
+!0 = !{!"branch_weights", i32 1, i32 9}
+
+;.
+; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 9}
+; CHECK: [[LOOP1]] = distinct !{[[LOOP1]], [[META2:![0-9]+]], [[META3:![0-9]+]], [[META4:![0-9]+]]}
+; CHECK: [[META2]] = !{!"llvm.loop.peeled.count", i32 2}
+; CHECK: [[META3]] = !{!"llvm.loop.estimated_trip_count", i32 8}
+; CHECK: [[META4]] = !{!"llvm.loop.unroll.disable"}
+;.