diff --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h index 765c613b04a44..86eb78dc70372 100644 --- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h +++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h @@ -163,6 +163,9 @@ LLVM_ABI bool computeUnrollCount( TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound); +LLVM_ABI std::optional<RecurrenceDescriptor> +canParallelizeReductionWhenUnrolling(PHINode &Phi, Loop *L, + ScalarEvolution *SE); } // end namespace llvm #endif // LLVM_TRANSFORMS_UTILS_UNROLLLOOP_H diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp index 86b268de43cf6..8a6c7789d1372 100644 --- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -41,6 +41,7 @@ #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" @@ -108,6 +109,9 @@ UnrollVerifyLoopInfo("unroll-verify-loopinfo", cl::Hidden, #endif ); +static cl::opt<bool> UnrollAddParallelReductions( + "unroll-add-parallel-reductions", cl::init(false), cl::Hidden, + cl::desc("Allow unrolling to add parallel reduction phis.")); /// Check if unrolling created a situation where we need to insert phi nodes to /// preserve LCSSA form. @@ -660,6 +664,39 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, OrigPHINode.push_back(cast<PHINode>(I)); } + // Collect phi nodes for reductions for which we can introduce multiple + // parallel reduction phis and compute the final reduction result after the + // loop. This requires a single exit block after unrolling. This is ensured by + // restricting to single-block loops where the unrolled iterations are known + // to not exit. 
+ DenseMap<PHINode *, RecurrenceDescriptor> Reductions; + bool CanAddAdditionalAccumulators = + UnrollAddParallelReductions && !CompletelyUnroll && + L->getNumBlocks() == 1 && + (ULO.Runtime || + (ExitInfos.contains(Header) && ((ExitInfos[Header].TripCount != 0 && + ExitInfos[Header].BreakoutTrip == 0)))); + + // Limit parallelizing reductions to unroll counts of 4 or less for now. + // TODO: The number of parallel reductions should depend on the number of + // execution units. We also don't have to add a parallel reduction phi per + // unrolled iteration, but could for example add a parallel phi for every 2 + // unrolled iterations. + if (CanAddAdditionalAccumulators && ULO.Count <= 4) { + for (PHINode &Phi : Header->phis()) { + auto RdxDesc = canParallelizeReductionWhenUnrolling(Phi, L, SE); + if (!RdxDesc) + continue; + + // Only handle duplicate phis for a single reduction for now. + // TODO: Handle any number of reductions + if (!Reductions.empty()) + continue; + + Reductions[&Phi] = *RdxDesc; + } + } + std::vector<BasicBlock *> Headers; std::vector<BasicBlock *> Latches; Headers.push_back(Header); @@ -710,6 +747,7 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, // latch. This is a reasonable default placement if we don't have block // frequencies, and if we do, well the layout will be adjusted later. auto BlockInsertPt = std::next(LatchBlock->getIterator()); + SmallVector<Instruction *> PartialReductions; for (unsigned It = 1; It != ULO.Count; ++It) { SmallVector<BasicBlock *, 8> NewBlocks; SmallDenseMap<const Loop *, Loop *, 4> NewLoops; @@ -733,6 +771,31 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, for (PHINode *OrigPHI : OrigPHINode) { PHINode *NewPHI = cast<PHINode>(VMap[OrigPHI]); Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock); + + // Use cloned phis as parallel phis for partial reductions, which will + // get combined to the final reduction result after the loop. + if (Reductions.contains(OrigPHI)) { + // Collect partial reduction results. 
+ if (PartialReductions.empty()) + PartialReductions.push_back(cast<Instruction>(InVal)); + PartialReductions.push_back(cast<Instruction>(VMap[InVal])); + + // Update the start value for the cloned phis to use the identity + // value for the reduction. + const RecurrenceDescriptor &RdxDesc = Reductions[OrigPHI]; + NewPHI->setIncomingValueForBlock( + L->getLoopPreheader(), + getRecurrenceIdentity(RdxDesc.getRecurrenceKind(), + OrigPHI->getType(), + RdxDesc.getFastMathFlags())); + + // Update NewPHI to use the cloned value for the iteration and move + // to header. + NewPHI->replaceUsesOfWith(InVal, VMap[InVal]); + NewPHI->moveBefore(OrigPHI->getIterator()); + continue; + } + if (Instruction *InValI = dyn_cast<Instruction>(InVal)) if (It > 1 && L->contains(InValI)) InVal = LastValueMap[InValI]; @@ -832,6 +895,9 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader)); PN->eraseFromParent(); } else if (ULO.Count > 1) { + if (Reductions.contains(PN)) + continue; + Value *InVal = PN->removeIncomingValue(LatchBlock, false); // If this value was defined in the loop, take the value defined by the // last iteration of the loop. @@ -1010,6 +1076,38 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, } } + // If there are partial reductions, create code in the exit block to compute + // the final result and update users of the final result. 
+ if (!PartialReductions.empty()) { + BasicBlock *ExitBlock = L->getExitBlock(); + assert(ExitBlock && + "Can only introduce parallel reduction phis with single exit block"); + assert(Reductions.size() == 1 && + "currently only a single reduction is supported"); + Value *FinalRdxValue = PartialReductions.back(); + Value *RdxResult = nullptr; + for (PHINode &Phi : ExitBlock->phis()) { + if (Phi.getIncomingValueForBlock(L->getLoopLatch()) != FinalRdxValue) + continue; + if (!RdxResult) { + RdxResult = PartialReductions.front(); + IRBuilder<> Builder(ExitBlock, ExitBlock->getFirstNonPHIIt()); + RecurKind RK = Reductions.begin()->second.getRecurrenceKind(); + for (Instruction *RdxPart : drop_begin(PartialReductions)) { + RdxResult = Builder.CreateBinOp( + (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK), + RdxPart, RdxResult, "bin.rdx"); + } + NeedToFixLCSSA = true; + for (Instruction *RdxPart : PartialReductions) + RdxPart->dropPoisonGeneratingFlags(); + } + + Phi.replaceAllUsesWith(RdxResult); + continue; + } + } + if (DTUToUse) { // Apply updates to the DomTree. DT = &DTU.getDomTree(); @@ -1111,3 +1209,41 @@ MDNode *llvm::GetUnrollMetadata(MDNode *LoopID, StringRef Name) { } return nullptr; } + +std::optional<RecurrenceDescriptor> +llvm::canParallelizeReductionWhenUnrolling(PHINode &Phi, Loop *L, + ScalarEvolution *SE) { + RecurrenceDescriptor RdxDesc; + if (!RecurrenceDescriptor::isReductionPHI(&Phi, L, RdxDesc, + /*DemandedBits=*/nullptr, + /*AC=*/nullptr, /*DT=*/nullptr, SE)) + return std::nullopt; + RecurKind RK = RdxDesc.getRecurrenceKind(); + // Skip unsupported reductions. + // TODO: Handle additional reductions, including FP and min-max + // reductions. 
+ if (!RecurrenceDescriptor::isIntegerRecurrenceKind(RK) || + RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) || + RecurrenceDescriptor::isFindIVRecurrenceKind(RK) || + RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) + return std::nullopt; + + if (RdxDesc.IntermediateStore) + return std::nullopt; + + // Don't unroll reductions with constant ops; those can be folded to a + // single induction update. + if (any_of(cast<Instruction>(Phi.getIncomingValueForBlock(L->getLoopLatch())) + ->operands(), + IsaPred<Constant>)) + return std::nullopt; + + BasicBlock *Latch = L->getLoopLatch(); + if (!Latch || + !is_contained( + cast<Instruction>(Phi.getIncomingValueForBlock(Latch))->operands(), + &Phi)) + return std::nullopt; + + return RdxDesc; +} diff --git a/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll b/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll index dd2913d9fa1c4..2d48d20ba9c5c 100644 --- a/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll +++ b/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -p loop-unroll -unroll-allow-partial -unroll-max-count=4 -S %s | FileCheck %s +; RUN: opt -p loop-unroll -unroll-add-parallel-reductions -unroll-allow-partial -unroll-max-count=4 -S %s | FileCheck %s define i32 @test_add(ptr %src, i64 %n, i32 %start) { ; CHECK-LABEL: define i32 @test_add( @@ -8,27 +8,33 @@ define i32 @test_add(ptr %src, i64 %n, i32 %start) { ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_NEXT_1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_2:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_3:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ 
[[RDX_NEXT_24:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV]] ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 1 -; CHECK-NEXT: [[RDX_NEXT:%.*]] = add i32 [[RDX]], [[L]] +; CHECK-NEXT: [[RDX_NEXT]] = add i32 [[RDX]], [[L]] ; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2 ; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT]] ; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 1 -; CHECK-NEXT: [[RDX_NEXT_1:%.*]] = add i32 [[RDX_NEXT]], [[L_1]] +; CHECK-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_1]], [[L_1]] ; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 ; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_1]] ; CHECK-NEXT: [[L_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 1 -; CHECK-NEXT: [[RDX_NEXT_2:%.*]] = add i32 [[RDX_NEXT_1]], [[L_2]] +; CHECK-NEXT: [[RDX_NEXT_2]] = add i32 [[RDX_NEXT_1]], [[L_2]] ; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4 ; CHECK-NEXT: [[GEP_SRC_24:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_2]] ; CHECK-NEXT: [[L_24:%.*]] = load i32, ptr [[GEP_SRC_24]], align 1 -; CHECK-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_NEXT_2]], [[L_24]] +; CHECK-NEXT: [[RDX_NEXT_24]] = add i32 [[RDX_3]], [[L_24]] ; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000 ; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_NEXT_LCSSA1:%.*]] = phi i32 [ [[RDX_NEXT_24]], %[[LOOP]] ] +; CHECK-NEXT: [[BIN_RDX:%.*]] = add i32 [[RDX_NEXT_3]], [[RDX_NEXT]] +; CHECK-NEXT: [[BIN_RDX1:%.*]] = add i32 [[RDX_NEXT_2]], [[BIN_RDX]] +; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = add i32 [[RDX_NEXT_24]], [[BIN_RDX1]] ; CHECK-NEXT: ret i32 
[[RDX_NEXT_LCSSA]] ; entry: @@ -203,33 +209,39 @@ define i32 @test_add_and_mul_reduction(ptr %src, i64 %n, i32 %start) { ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RDX_1:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_1_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_1_1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_1_NEXT_1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_1_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_1_NEXT_2:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_1_3:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_1_NEXT_24:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_1:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_1_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[RDX_2:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_2_NEXT_3:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV]] ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 1 -; CHECK-NEXT: [[RDX_1_NEXT:%.*]] = add i32 [[RDX_1]], [[L]] +; CHECK-NEXT: [[RDX_1_NEXT]] = add i32 [[RDX_1]], [[L]] ; CHECK-NEXT: [[RDX_2_NEXT:%.*]] = mul i32 [[RDX_2]], [[L]] ; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2 ; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT]] ; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 1 -; CHECK-NEXT: [[RDX_1_2:%.*]] = add i32 [[RDX_1_NEXT]], [[L_1]] +; CHECK-NEXT: [[RDX_1_NEXT_1]] = add i32 [[RDX_1_1]], [[L_1]] ; CHECK-NEXT: [[RDX_2_2:%.*]] = mul i32 [[RDX_2_NEXT]], [[L_1]] ; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 ; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_1]] ; CHECK-NEXT: [[L_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 1 -; CHECK-NEXT: [[RDX_1_NEXT_2:%.*]] = add i32 [[RDX_1_2]], [[L_2]] +; CHECK-NEXT: [[RDX_1_NEXT_2]] = add i32 [[RDX_1_2]], [[L_2]] ; CHECK-NEXT: [[RDX_2_NEXT_2:%.*]] 
= mul i32 [[RDX_2_2]], [[L_2]] ; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4 ; CHECK-NEXT: [[GEP_SRC_24:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_2]] ; CHECK-NEXT: [[L_24:%.*]] = load i32, ptr [[GEP_SRC_24]], align 1 -; CHECK-NEXT: [[RDX_1_NEXT_3]] = add i32 [[RDX_1_NEXT_2]], [[L_24]] +; CHECK-NEXT: [[RDX_1_NEXT_24]] = add i32 [[RDX_1_3]], [[L_24]] ; CHECK-NEXT: [[RDX_2_NEXT_3]] = mul i32 [[RDX_2_NEXT_2]], [[L_24]] ; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000 ; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RDX_1_NEXT_LCSSA:%.*]] = phi i32 [ [[RDX_1_NEXT_3]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_1_NEXT_LCSSA1:%.*]] = phi i32 [ [[RDX_1_NEXT_24]], %[[LOOP]] ] ; CHECK-NEXT: [[BIN_RDX5:%.*]] = phi i32 [ [[RDX_2_NEXT_3]], %[[LOOP]] ] +; CHECK-NEXT: [[BIN_RDX:%.*]] = add i32 [[RDX_1_NEXT_1]], [[RDX_1_NEXT]] +; CHECK-NEXT: [[BIN_RDX1:%.*]] = add i32 [[RDX_1_NEXT_2]], [[BIN_RDX]] +; CHECK-NEXT: [[RDX_1_NEXT_LCSSA:%.*]] = add i32 [[RDX_1_NEXT_24]], [[BIN_RDX1]] ; CHECK-NEXT: [[RES:%.*]] = add i32 [[RDX_1_NEXT_LCSSA]], [[BIN_RDX5]] ; CHECK-NEXT: ret i32 [[RES]] ; @@ -509,20 +521,26 @@ define i32 @test_add_with_call(i64 %n, i32 %start) { ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_2:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_3:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[L:%.*]] = call i32 @foo() -; CHECK-NEXT: [[RDX_NEXT:%.*]] = add i32 [[RDX]], [[L]] +; CHECK-NEXT: [[RDX_NEXT]] = add i32 [[RDX]], [[L]] ; 
CHECK-NEXT: [[L_1:%.*]] = call i32 @foo() -; CHECK-NEXT: [[RDX_2:%.*]] = add i32 [[RDX_NEXT]], [[L_1]] +; CHECK-NEXT: [[RDX_NEXT_1]] = add i32 [[RDX_1]], [[L_1]] ; CHECK-NEXT: [[L_2:%.*]] = call i32 @foo() -; CHECK-NEXT: [[RDX_NEXT_2:%.*]] = add i32 [[RDX_2]], [[L_2]] +; CHECK-NEXT: [[RDX_NEXT_2]] = add i32 [[RDX_2]], [[L_2]] ; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4 ; CHECK-NEXT: [[L_3:%.*]] = call i32 @foo() -; CHECK-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_NEXT_2]], [[L_3]] +; CHECK-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_3]], [[L_3]] ; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000 ; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[BIN_RDX2:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ] +; CHECK-NEXT: [[BIN_RDX:%.*]] = add i32 [[RDX_NEXT_1]], [[RDX_NEXT]] +; CHECK-NEXT: [[BIN_RDX1:%.*]] = add i32 [[RDX_NEXT_2]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX2:%.*]] = add i32 [[RDX_NEXT_3]], [[BIN_RDX1]] ; CHECK-NEXT: ret i32 [[BIN_RDX2]] ; entry: @@ -550,35 +568,41 @@ define i32 @test_add_with_backward_dep(ptr %p, i64 %n, i32 %start) { ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_2:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_3:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV]] ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP]], align 
4 ; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV_NEXT]] ; CHECK-NEXT: store i32 0, ptr [[GEP_1]], align 4 -; CHECK-NEXT: [[RDX_NEXT:%.*]] = add i32 [[RDX]], [[L]] +; CHECK-NEXT: [[RDX_NEXT]] = add i32 [[RDX]], [[L]] ; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2 ; CHECK-NEXT: [[GEP_11:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV_NEXT]] ; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[GEP_11]], align 4 ; CHECK-NEXT: [[GEP_1_1:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV_NEXT_1]] ; CHECK-NEXT: store i32 0, ptr [[GEP_1_1]], align 4 -; CHECK-NEXT: [[RDX_2:%.*]] = add i32 [[RDX_NEXT]], [[L_1]] +; CHECK-NEXT: [[RDX_NEXT_1]] = add i32 [[RDX_1]], [[L_1]] ; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 ; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV_NEXT_1]] ; CHECK-NEXT: [[L_2:%.*]] = load i32, ptr [[GEP_2]], align 4 ; CHECK-NEXT: [[GEP_1_2:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV_NEXT_2]] ; CHECK-NEXT: store i32 0, ptr [[GEP_1_2]], align 4 -; CHECK-NEXT: [[RDX_NEXT_2:%.*]] = add i32 [[RDX_2]], [[L_2]] +; CHECK-NEXT: [[RDX_NEXT_2]] = add i32 [[RDX_2]], [[L_2]] ; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4 ; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV_NEXT_2]] ; CHECK-NEXT: [[L_3:%.*]] = load i32, ptr [[GEP_3]], align 4 ; CHECK-NEXT: [[GEP_1_3:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV_NEXT_3]] ; CHECK-NEXT: store i32 0, ptr [[GEP_1_3]], align 4 -; CHECK-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_NEXT_2]], [[L_3]] +; CHECK-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_3]], [[L_3]] ; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000 ; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[BIN_RDX3:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ] +; CHECK-NEXT: 
[[BIN_RDX:%.*]] = add i32 [[RDX_NEXT_1]], [[RDX_NEXT]] +; CHECK-NEXT: [[BIN_RDX2:%.*]] = add i32 [[RDX_NEXT_2]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX3:%.*]] = add i32 [[RDX_NEXT_3]], [[BIN_RDX2]] ; CHECK-NEXT: ret i32 [[BIN_RDX3]] ; entry: diff --git a/llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll b/llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll index 89f06ad373aa9..0b9c6ac1d324b 100644 --- a/llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll +++ b/llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -p loop-unroll -S %s | FileCheck %s +; RUN: opt -p loop-unroll -unroll-add-parallel-reductions -S %s | FileCheck %s define i32 @test_add_reduction(ptr %a, i64 %n) { ; CHECK-LABEL: define i32 @test_add_reduction( @@ -14,15 +14,16 @@ define i32 @test_add_reduction(ptr %a, i64 %n) { ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_1:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_1:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_1:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[GEP_A]], align 2 -; CHECK-NEXT: [[RDX_NEXT:%.*]] = add nuw nsw i32 [[RDX]], [[TMP2]] +; CHECK-NEXT: [[RDX_NEXT]] = add i32 [[RDX]], [[TMP2]] ; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT]] ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[GEP_A_1]], align 2 -; CHECK-NEXT: [[RDX_NEXT_1]] = 
add nuw nsw i32 [[RDX_NEXT]], [[TMP3]] +; CHECK-NEXT: [[RDX_NEXT_1]] = add i32 [[RDX_1]], [[TMP3]] ; CHECK-NEXT: [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2 ; CHECK-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2 ; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]] @@ -31,11 +32,12 @@ define i32 @test_add_reduction(ptr %a, i64 %n) { ; CHECK-NEXT: [[RES_PH_PH:%.*]] = phi i32 [ [[RDX_NEXT_1]], %[[LOOP]] ] ; CHECK-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_1]], %[[LOOP]] ] ; CHECK-NEXT: [[RDX_UNR_PH:%.*]] = phi i32 [ [[RDX_NEXT_1]], %[[LOOP]] ] +; CHECK-NEXT: [[BIN_RDX:%.*]] = add i32 [[RDX_NEXT_1]], [[RDX_NEXT]] ; CHECK-NEXT: br label %[[EXIT_UNR_LCSSA]] ; CHECK: [[EXIT_UNR_LCSSA]]: -; CHECK-NEXT: [[RES_PH:%.*]] = phi i32 [ poison, %[[ENTRY]] ], [ [[RES_PH_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[RES_PH:%.*]] = phi i32 [ poison, %[[ENTRY]] ], [ [[BIN_RDX]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] ; CHECK-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] -; CHECK-NEXT: [[RDX_UNR:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[RDX_UNR:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[BIN_RDX]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] ; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0 ; CHECK-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[EXIT:.*]] ; CHECK: [[LOOP_EPIL_PREHEADER]]: