[InstCombine] Match intrinsic recurrences when known to be hoisted #149858
Conversation
@llvm/pr-subscribers-llvm-transforms

Author: Antonio Frighetto (antoniofrighetto)

Changes

For value-accumulating recurrences of kind:

    %umax.acc = phi i8 [ %umax, %backedge ], [ %a, %entry ]
    %umax = call i8 @llvm.umax.i8(i8 %umax.acc, i8 %b)

the binary intrinsic may be simplified into an intrinsic with the init value and the other operand, if the latter is loop-invariant:

    %umax = call i8 @llvm.umax.i8(i8 %a, i8 %b)

Fixes: #145875. Full diff: https://github.com/llvm/llvm-project/pull/149858.diff

2 Files Affected:
- llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
- llvm/test/Transforms/InstCombine/recurrence-binary-intrinsic.ll
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index d88bc2c4901c7..c0abad6552440 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1532,6 +1532,46 @@ static Instruction *foldBitOrderCrossLogicOp(Value *V,
return nullptr;
}
+static bool foldBinaryIntrinsicRecurrence(InstCombinerImpl &IC,
+ IntrinsicInst *II) {
+ PHINode *PN;
+ Value *Init, *OtherOp;
+
+ // A binary intrinsic recurrence with loop-invariant operands is equivalent to
+ // `call @llvm.binary.intrinsic(Init, OtherOp)`.
+ if (!matchSimpleBinaryIntrinsicRecurrence(II, PN, Init, OtherOp) ||
+ isa<Constant>(OtherOp) || !PN->hasOneUse() ||
+ !IC.getDominatorTree().dominates(OtherOp, PN))
+ return false;
+
+ auto IID = II->getIntrinsicID();
+ switch (IID) {
+ case Intrinsic::maxnum:
+ case Intrinsic::minnum:
+ case Intrinsic::maximum:
+ case Intrinsic::minimum:
+ case Intrinsic::maximumnum:
+ case Intrinsic::minimumnum:
+ case Intrinsic::smax:
+ case Intrinsic::smin:
+ case Intrinsic::umax:
+ case Intrinsic::umin:
+ break;
+ default:
+ return false;
+ }
+
+ IC.Builder.SetInsertPoint(&*PN->getParent()->getFirstInsertionPt());
+ auto *InvariantBinaryInst =
+ cast<IntrinsicInst>(IC.Builder.CreateBinaryIntrinsic(IID, Init, OtherOp));
+ if (isa<FPMathOperator>(II))
+ InvariantBinaryInst->copyFastMathFlags(II);
+ InvariantBinaryInst->takeName(II);
+ IC.eraseInstFromFunction(*IC.replaceInstUsesWith(*II, InvariantBinaryInst));
+ IC.eraseInstFromFunction(*PN);
+ return true;
+}
+
static Value *simplifyReductionOperand(Value *Arg, bool CanReorderLanes) {
if (!CanReorderLanes)
return nullptr;
@@ -3906,6 +3946,14 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
if (Value *Reverse = foldReversedIntrinsicOperands(II))
return replaceInstUsesWith(*II, Reverse);
+ // Attempt to simplify value-accumulating recurrences of kind:
+ // %umax.acc = phi i8 [ %umax, %backedge ], [ %a, %entry ]
+ // %umax = call i8 @llvm.umax.i8(i8 %umax.acc, i8 %b)
+ // And let the binary intrinsic be hoisted, when the operands are known to be
+ // loop-invariant.
+ if (foldBinaryIntrinsicRecurrence(*this, II))
+ return nullptr;
+
// Some intrinsics (like experimental_gc_statepoint) can be used in invoke
// context, so it is handled in visitCallBase and we should trigger it.
return visitCallBase(*II);
diff --git a/llvm/test/Transforms/InstCombine/recurrence-binary-intrinsic.ll b/llvm/test/Transforms/InstCombine/recurrence-binary-intrinsic.ll
new file mode 100644
index 0000000000000..4fa54ba6f023c
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/recurrence-binary-intrinsic.ll
@@ -0,0 +1,125 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+
+define i8 @simple_recurrence_intrinsic(i8 %n, i8 %a, i8 %b) {
+; CHECK-LABEL: define i8 @simple_recurrence_intrinsic(
+; CHECK-SAME: i8 [[N:%.*]], i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add nuw i8 [[IV]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[UMAX:%.*]] = call i8 @llvm.umax.i8(i8 [[A]], i8 [[B]])
+; CHECK-NEXT: ret i8 [[UMAX]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i8 [ %iv.next, %loop ], [ 0, %entry ]
+ %umax.acc = phi i8 [ %umax, %loop ], [ %a, %entry ]
+ %umax = call i8 @llvm.umax.i8(i8 %umax.acc, i8 %b)
+ %iv.next = add nuw i8 %iv, 1
+ %cmp = icmp ult i8 %iv.next, %n
+ br i1 %cmp, label %loop, label %exit
+
+exit:
+ ret i8 %umax
+}
+
+define float @simple_recurrence_intrinsic_2(i32 %n, float %a, float %b) {
+; CHECK-LABEL: define float @simple_recurrence_intrinsic_2(
+; CHECK-SAME: i32 [[N:%.*]], float [[A:%.*]], float [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add nuw i32 [[IV]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[FMAX:%.*]] = call nnan float @llvm.maximum.f32(float [[A]], float [[B]])
+; CHECK-NEXT: ret float [[FMAX]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
+ %fmax.acc = phi float [ %fmax, %loop ], [ %a, %entry ]
+ %fmax = call nnan float @llvm.maximum.f32(float %fmax.acc, float %b)
+ %iv.next = add nuw i32 %iv, 1
+ %cmp = icmp ult i32 %iv.next, %n
+ br i1 %cmp, label %loop, label %exit
+exit:
+ ret float %fmax
+}
+
+define i8 @simple_recurrence_intrinsic_multiuse_umax(i8 %n, i8 %a, i8 %b) {
+; CHECK-LABEL: define i8 @simple_recurrence_intrinsic_multiuse_umax(
+; CHECK-SAME: i8 [[N:%.*]], i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[UMAX_ACC:%.*]] = phi i8 [ [[UMAX:%.*]], %[[LOOP]] ], [ [[A]], %[[ENTRY]] ]
+; CHECK-NEXT: call void @use(i8 [[UMAX_ACC]])
+; CHECK-NEXT: [[UMAX]] = call i8 @llvm.umax.i8(i8 [[UMAX_ACC]], i8 [[B]])
+; CHECK-NEXT: [[IV_NEXT]] = add nuw i8 [[IV]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret i8 [[UMAX]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i8 [ %iv.next, %loop ], [ 0, %entry ]
+ %umax.acc = phi i8 [ %umax, %loop ], [ %a, %entry ]
+ call void @use(i8 %umax.acc)
+ %umax = call i8 @llvm.umax.i8(i8 %umax.acc, i8 %b)
+ %iv.next = add nuw i8 %iv, 1
+ %cmp = icmp ult i8 %iv.next, %n
+ br i1 %cmp, label %loop, label %exit
+
+exit:
+ ret i8 %umax
+}
+
+define i8 @simple_recurrence_intrinsic_arg_loop_variant(i8 %n, i8 %a) {
+; CHECK-LABEL: define i8 @simple_recurrence_intrinsic_arg_loop_variant(
+; CHECK-SAME: i8 [[N:%.*]], i8 [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[UMAX_ACC:%.*]] = phi i8 [ [[UMAX:%.*]], %[[LOOP]] ], [ [[A]], %[[ENTRY]] ]
+; CHECK-NEXT: [[B:%.*]] = xor i8 [[IV]], 42
+; CHECK-NEXT: [[UMAX]] = call i8 @llvm.umax.i8(i8 [[UMAX_ACC]], i8 [[B]])
+; CHECK-NEXT: [[IV_NEXT]] = add nuw i8 [[IV]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret i8 [[UMAX]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i8 [ %iv.next, %loop ], [ 0, %entry ]
+ %umax.acc = phi i8 [ %umax, %loop ], [ %a, %entry ]
+ %b = xor i8 %iv, 42
+ %umax = call i8 @llvm.umax.i8(i8 %umax.acc, i8 %b)
+ %iv.next = add nuw i8 %iv, 1
+ %cmp = icmp ult i8 %iv.next, %n
+ br i1 %cmp, label %loop, label %exit
+
+exit:
+ ret i8 %umax
+}
+
+declare void @use(i8)
Negative test with an invalid intrinsic?
Added a test w/ copysign, thanks (I think we could support that too, but I wouldn't expect to see a copysign in a recurrence?).
I'd suggest using something like uadd.sat or usub.sat, where the transform would be invalid because the intrinsic is not idempotent (in the sense that f(f(x, y), y) != f(x, y)).
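To make the non-idempotence concrete, here is a small standalone sketch (an editorial illustration, not code from the thread): hoisting the recurrence keeps only one application of the loop-invariant operand, which changes the result for uadd.sat but never for umax.

```c++
// Illustration (not PR code): two iterations of a uadd.sat recurrence differ
// from a single application, so hoisting the intrinsic would be unsound.
#include <cassert>
#include <cstdint>

// Reference model of @llvm.uadd.sat.i8: clamp the sum to 255.
static uint8_t uadd_sat(uint8_t X, uint8_t Y) {
  unsigned Sum = unsigned(X) + unsigned(Y);
  return Sum > 255u ? 255u : uint8_t(Sum);
}

static uint8_t umax(uint8_t X, uint8_t Y) { return X > Y ? X : Y; }

int main() {
  // uadd.sat: f(f(a, b), b) accumulates b twice -> 50, while f(a, b) == 30.
  assert(uadd_sat(uadd_sat(10, 20), 20) == 50);
  assert(uadd_sat(10, 20) == 30);
  // umax: f(f(a, b), b) == f(a, b) for all inputs, so the fold is safe.
  assert(umax(umax(10, 20), 20) == umax(10, 20));
}
```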
That was indeed a poor negative test, updated, thanks.
For value-accumulating recurrences of kind:

```
%umax.acc = phi i8 [ %umax, %backedge ], [ %a, %entry ]
%umax = call i8 @llvm.umax.i8(i8 %umax.acc, i8 %b)
```

The binary intrinsic may be simplified into an intrinsic with the init value and the other operand, if the latter is loop-invariant:

```
%umax = call i8 @llvm.umax.i8(i8 %a, i8 %b)
```

Fixes: llvm#145875.
Force-pushed from 42ddcf9 to 522b476.
LGTM
LG
auto *InvariantBinaryInst =
    IC.Builder.CreateBinaryIntrinsic(IID, Init, OtherOp);
if (isa<FPMathOperator>(InvariantBinaryInst))
  cast<Instruction>(InvariantBinaryInst)->copyFastMathFlags(II);
Suggested change:
- auto *InvariantBinaryInst =
-     IC.Builder.CreateBinaryIntrinsic(IID, Init, OtherOp);
- if (isa<FPMathOperator>(InvariantBinaryInst))
-   cast<Instruction>(InvariantBinaryInst)->copyFastMathFlags(II);
+ auto *InvariantBinaryInst =
+     IC.Builder.CreateBinaryIntrinsic(IID, Init, OtherOp, /*FMFSource=*/II);
That's unfortunately not possible: passing an FMFSource triggers the assertion `isa<FPMathOperator>(this) && "getting fast-math flag on invalid op"` on non-fast-math binary intrinsics. Perhaps we may want to change this in IRBuilder intrinsic creation in the future.
Use dyn_cast<FPMathOperator>(II)?
In this case, I think you'd need another dyn_cast<Instruction>(InvariantBinaryInst) to check that CreateBinaryIntrinsic didn't constant-fold, as copyFastMathFlags is private to FPMathOperator. I think the current one may be cleaner.
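For reference, the guarded variant being discussed might look roughly like this (an editorial sketch reconstructed from the thread, not code from the PR):

```c++
// Hypothetical shape of the dyn_cast-based variant: copy fast-math flags only
// for FP intrinsics, and only if the builder actually returned an instruction
// (CreateBinaryIntrinsic may constant-fold to a plain Value).
Value *InvariantBinaryInst =
    IC.Builder.CreateBinaryIntrinsic(IID, Init, OtherOp);
if (isa<FPMathOperator>(II))
  if (auto *NewInst = dyn_cast<Instruction>(InvariantBinaryInst))
    NewInst->copyFastMathFlags(II);
```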
Hm, I'm curious whether we would get this optimization "for free" if we called foldOpIntoPhi for intrinsics.
Tried pivoting the fold into the intrinsics visitor as follows:

    PHINode *PN;
    Value *Init, *OtherOp;
    if (matchSimpleBinaryIntrinsicRecurrence(II, PN, Init, OtherOp) &&
        DT.dominates(OtherOp, PN))
      if (auto *Res = foldOpIntoPhi(*II, PN, true))
        return Res;

We fold the one-use phi case correctly; for the multi-use case, however, we bail out in llvm/lib/Transforms/InstCombine/InstructionCombining.cpp, lines 1947-1948 (at c9ceb9b). I may take a deeper look as a follow-up patch, if you'd like.
@antoniofrighetto What I had in mind is something more general along these lines: #151115 (I took the liberty of pushing your tests to main so that it shows the diff.) This doesn't do any recurrence check upfront. You're right that this can't handle the multi-use case. The question then would be whether handling multi-use is worth a dedicated fold. (Though possibly we could extend foldOpIntoPhi to handle this specific multi-use case, not sure...)
@nikic Oh, I see that now, thanks; that indeed looks like a nice extension. Both using the recurrence helper and foldOpIntoPhi seem like cheap checks, so I assume the choice here mostly boils down to having less code around. If we go with your approach, I may take a look into supporting the specific multi-use case (the second run on the benchmark suite shows some nice little improvements with multi-use phi, suggesting it could indeed be worthwhile to have it). My only thought is that foldBinaryIntrinsicRecurrence doesn't look particularly invasive and, considering we already have some matchSimpleRecurrence folds around, it might make sense to have a dedicated fold for intrinsic recurrences as well.
Comparing the results of the first run dtcxzyw/llvm-opt-benchmark#2596 (w/o multi-use phi) with the one in #151115, it seems like there are four test cases that were not folded via foldOpIntoPhi. Wouldn't this imply that #151115 does not entirely subsume this fold?
@antoniofrighetto The diff only shows a subset of changed files due to GitHub limitations. It's hard to say whether the transform did not happen or whether it's just missing from the diff. We'd need to run one on top of the other to check that reliably.
The diff should be a subset, though; I thought the list in dtcxzyw/llvm-opt-benchmark#2616 (comment) would encompass all the changes.
For value-accumulating recurrences of kind:

    %umax.acc = phi i8 [ %umax, %backedge ], [ %a, %entry ]
    %umax = call i8 @llvm.umax.i8(i8 %umax.acc, i8 %b)

The binary intrinsic may be simplified into an intrinsic with the init value and the other operand, if the latter is loop-invariant:

    %umax = call i8 @llvm.umax.i8(i8 %a, i8 %b)

Proofs: https://alive2.llvm.org/ce/z/ea2cVC.
Fixes: #145875.