[AggressiveInstCombine] Make cttz fold more resiliant to non-array geps #150896

davemgreen · 2025-07-28T07:46:47Z

Similar to #150639 this fixes the AggressiveInstCombine fold for convert tables to cttz instructions if the gep types are not array types. i.e gep i16 @glob, i64 %idx instead of gep [64 x i16] @glob, i64 0, i64 %idx.

llvmbot · 2025-07-28T07:47:29Z

@llvm/pr-subscribers-llvm-transforms

Author: David Green (davemgreen)

Changes

Similar to #150639 this fixes the AggressiveInstCombine fold for convert tables to cttz instructions if the gep types are not array types. i.e gep i16 @glob, i64 %idx instead of gep [64 x i16] @glob, i64 0, i64 %idx.

Full diff: https://github.com/llvm/llvm-project/pull/150896.diff

3 Files Affected:

(modified) llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp (+17-15)
(modified) llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll (+47)
(modified) llvm/test/Transforms/PhaseOrdering/lower-table-based-cttz.ll (+37-5)

diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 7af5ba4e0e103..975590214ffa6 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -547,14 +547,20 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I) {
     return false;
 
   GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getPointerOperand());
-  if (!GEP || !GEP->hasNoUnsignedSignedWrap() || GEP->getNumIndices() != 2)
+  if (!GEP || !GEP->hasNoUnsignedSignedWrap())
     return false;
 
-  if (!GEP->getSourceElementType()->isArrayTy())
-    return false;
-
-  uint64_t ArraySize = GEP->getSourceElementType()->getArrayNumElements();
-  if (ArraySize != 32 && ArraySize != 64)
+  Type *GEPSrcEltTy = GEP->getSourceElementType();
+  Value *GepIdx;
+  if (GEP->getNumIndices() == 2) {
+    if (!GEPSrcEltTy->isArrayTy() ||
+        !match(GEP->idx_begin()->get(), m_ZeroInt()))
+      return false;
+    GEPSrcEltTy = GEPSrcEltTy->getArrayElementType();
+    GepIdx = std::next(GEP->idx_begin())->get();
+  } else if (GEP->getNumIndices() == 1)
+    GepIdx = GEP->idx_begin()->get();
+  else
     return false;
 
   GlobalVariable *GVTable = dyn_cast<GlobalVariable>(GEP->getPointerOperand());
@@ -563,21 +569,17 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I) {
 
   ConstantDataArray *ConstData =
       dyn_cast<ConstantDataArray>(GVTable->getInitializer());
-  if (!ConstData)
-    return false;
-
-  if (!match(GEP->idx_begin()->get(), m_ZeroInt()))
+  if (!ConstData || ConstData->getElementType() != GEPSrcEltTy)
     return false;
 
-  Value *Idx2 = std::next(GEP->idx_begin())->get();
   Value *X1;
   uint64_t MulConst, ShiftConst;
   // FIXME: 64-bit targets have `i64` type for the GEP index, so this match will
   // probably fail for other (e.g. 32-bit) targets.
-  if (!match(Idx2, m_ZExtOrSelf(
-                       m_LShr(m_Mul(m_c_And(m_Neg(m_Value(X1)), m_Deferred(X1)),
-                                    m_ConstantInt(MulConst)),
-                              m_ConstantInt(ShiftConst)))))
+  if (!match(GepIdx, m_ZExtOrSelf(m_LShr(
+                         m_Mul(m_c_And(m_Neg(m_Value(X1)), m_Deferred(X1)),
+                               m_ConstantInt(MulConst)),
+                         m_ConstantInt(ShiftConst)))))
     return false;
 
   unsigned InputBits = X1->getType()->getScalarSizeInBits();
diff --git a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
index 4d571999df372..0bfa891a7887c 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
@@ -276,3 +276,50 @@ entry:
   %0 = load i32, ptr %arrayidx, align 4
   ret i32 %0
 }
+
+define i32 @ctz1_with_i8_gep(i32 %x) {
+; CHECK-LABEL: @ctz1_with_i8_gep(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP3]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %sub = sub i32 0, %x
+  %and = and i32 %sub, %x
+  %mul = mul i32 %and, 125613361
+  %shr = lshr i32 %mul, 27
+  %idxprom = zext i32 %shr to i64
+  %arrayidx = getelementptr inbounds i8, ptr @ctz7.table, i64 %idxprom
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  ret i32 %conv
+}
+
+define i32 @ctz2_with_i8_gep(i32 %x) {
+; CHECK-LABEL: @ctz2_with_i8_gep(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 0, [[X:%.*]]
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[SUB]], [[X]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[AND]], 72416175
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[MUL]], 26
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[SHR]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [64 x i8], ptr @ctz2.table, i64 0, i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %sub = sub i32 0, %x
+  %and = and i32 %sub, %x
+  %mul = mul i32 %and, 72416175
+  %shr = lshr i32 %mul, 26
+  %idxprom = zext i32 %shr to i64
+  %arrayidx = getelementptr inbounds [64 x i8], ptr @ctz2.table, i64 0, i64 %idxprom
+  %0 = load i16, ptr %arrayidx, align 2
+  %conv = sext i16 %0 to i32
+  ret i32 %conv
+}
diff --git a/llvm/test/Transforms/PhaseOrdering/lower-table-based-cttz.ll b/llvm/test/Transforms/PhaseOrdering/lower-table-based-cttz.ll
index 19fbc1f1ae64e..4455016c3e4a4 100644
--- a/llvm/test/Transforms/PhaseOrdering/lower-table-based-cttz.ll
+++ b/llvm/test/Transforms/PhaseOrdering/lower-table-based-cttz.ll
@@ -1,3 +1,6 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -O3 -S < %s | FileCheck %s
+
 ;; This tests lowering of the implementations of table-based ctz
 ;; algorithm to the llvm.cttz instruction in the -O3 case.
 
@@ -13,13 +16,17 @@
 ;; }
 ;; Compiled as: clang -O3 test.c -S -emit-llvm -Xclang -disable-llvm-optzns
 
-; RUN: opt -O3 -S < %s | FileCheck %s
-
-; CHECK: call range(i32 0, 33) i32 @llvm.cttz.i32
-
 @ctz1.table = internal constant [32 x i8] c"\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09", align 16
 
-define i32 @ctz1(i32 noundef %x) {
+define i32 @ctz(i32 noundef %x) {
+; CHECK-LABEL: define range(i32 0, 32) i32 @ctz(
+; CHECK-SAME: i32 noundef [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 [[X]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT:    [[CONV:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]]
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
 entry:
   %x.addr = alloca i32, align 4
   store i32 %x, ptr %x.addr, align 4
@@ -35,3 +42,28 @@ entry:
   %conv = sext i8 %2 to i32
   ret i32 %conv
 }
+
+define i32 @ctz_nonarraygep(i32 noundef %x) {
+; CHECK-LABEL: define range(i32 0, 32) i32 @ctz_nonarraygep(
+; CHECK-SAME: i32 noundef [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 [[X]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT:    [[CONV:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]]
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %x.addr = alloca i32, align 4
+  store i32 %x, ptr %x.addr, align 4
+  %0 = load i32, ptr %x.addr, align 4
+  %1 = load i32, ptr %x.addr, align 4
+  %sub = sub i32 0, %1
+  %and = and i32 %0, %sub
+  %mul = mul i32 %and, 125613361
+  %shr = lshr i32 %mul, 27
+  %idxprom = zext i32 %shr to i64
+  %arrayidx = getelementptr inbounds i8, ptr @ctz1.table, i64 %idxprom
+  %2 = load i8, ptr %arrayidx, align 1
+  %conv = sext i8 %2 to i32
+  ret i32 %conv
+}

nikic

Can you please directly port this one to be type independent (using ConstantFoldLoadFromConst)? It looks like it shouldn't be hard.

davemgreen · 2025-07-29T07:48:30Z

Well I wrote some changes and in the process got i16 working from i128 integers. This is the first time I've seen ConstantFoldLoadFromConst so let me know if you were thinking of something else.

llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp

nikic

LGTM, thanks!

llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp

davemgreen requested review from nikic and dtcxzyw July 28, 2025 07:46

llvmbot added the llvm:transforms label Jul 28, 2025

nikic reviewed Jul 28, 2025

View reviewed changes

nikic reviewed Jul 29, 2025

View reviewed changes

llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp Outdated Show resolved Hide resolved

llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp Outdated Show resolved Hide resolved

davemgreen force-pushed the gh-ic-gepcttz branch from 67cf707 to fcc6d0c Compare July 29, 2025 13:11

nikic mentioned this pull request Jul 29, 2025

Task submission dtcxzyw/llvm-opt-benchmark#1312

Open

zyw-bot mentioned this pull request Jul 29, 2025

pre-commit: PR150896 dtcxzyw/llvm-opt-benchmark#2621

Closed

nikic approved these changes Jul 29, 2025

View reviewed changes

llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp Outdated Show resolved Hide resolved

llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp Outdated Show resolved Hide resolved

dtcxzyw mentioned this pull request Jul 30, 2025

[InstCombine] Strip trailing zero GEP indices #151338

Merged

davemgreen added 5 commits July 31, 2025 07:35

[AggressiveInstCombine] Make cttz fold more resiliant to non-array geps

762250f

Use ConstantFoldLoadFromConst, and get i16 tables working as a result

12de7e7

Attempt to get i8 gep offsets working

984319e

Use collectOffset, add some explanation

f6b9b8d

Get i128 working too

cfa67d8

davemgreen force-pushed the gh-ic-gepcttz branch from fcc6d0c to cfa67d8 Compare July 31, 2025 09:05

nikic approved these changes Jul 31, 2025

View reviewed changes

davemgreen merged commit 8f968fe into llvm:main Jul 31, 2025
9 checks passed

davemgreen deleted the gh-ic-gepcttz branch July 31, 2025 15:53

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[AggressiveInstCombine] Make cttz fold more resiliant to non-array geps #150896

[AggressiveInstCombine] Make cttz fold more resiliant to non-array geps #150896

Uh oh!

davemgreen commented Jul 28, 2025

Uh oh!

llvmbot commented Jul 28, 2025

Uh oh!

nikic left a comment

Uh oh!

davemgreen commented Jul 29, 2025

Uh oh!

Uh oh!

Uh oh!

nikic left a comment

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

[AggressiveInstCombine] Make cttz fold more resiliant to non-array geps #150896

[AggressiveInstCombine] Make cttz fold more resiliant to non-array geps #150896

Uh oh!

Conversation

davemgreen commented Jul 28, 2025

Uh oh!

llvmbot commented Jul 28, 2025

Uh oh!

nikic left a comment

Choose a reason for hiding this comment

Uh oh!

davemgreen commented Jul 29, 2025

Uh oh!

Uh oh!

Uh oh!

nikic left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!