[AggressiveInstCombine] Make cttz fold more resilient to non-array geps

davemgreen · davemgreen · commit bc33a1426101 · 2025-07-29T14:04:44.000+01:00
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -547,14 +547,20 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I) {
     return false;
 
   GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getPointerOperand());
-  if (!GEP || !GEP->hasNoUnsignedSignedWrap() || GEP->getNumIndices() != 2)
+  if (!GEP || !GEP->hasNoUnsignedSignedWrap())
     return false;
 
-  if (!GEP->getSourceElementType()->isArrayTy())
-    return false;
-
-  uint64_t ArraySize = GEP->getSourceElementType()->getArrayNumElements();
-  if (ArraySize != 32 && ArraySize != 64)
+  Type *GEPSrcEltTy = GEP->getSourceElementType();
+  Value *GepIdx;
+  if (GEP->getNumIndices() == 2) {
+    if (!GEPSrcEltTy->isArrayTy() ||
+        !match(GEP->idx_begin()->get(), m_ZeroInt()))
+      return false;
+    GEPSrcEltTy = GEPSrcEltTy->getArrayElementType();
+    GepIdx = std::next(GEP->idx_begin())->get();
+  } else if (GEP->getNumIndices() == 1)
+    GepIdx = GEP->idx_begin()->get();
+  else
     return false;
 
   GlobalVariable *GVTable = dyn_cast<GlobalVariable>(GEP->getPointerOperand());
@@ -563,21 +569,17 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I) {
 
   ConstantDataArray *ConstData =
       dyn_cast<ConstantDataArray>(GVTable->getInitializer());
-  if (!ConstData)
-    return false;
-
-  if (!match(GEP->idx_begin()->get(), m_ZeroInt()))
+  if (!ConstData || ConstData->getElementType() != GEPSrcEltTy)
     return false;
 
-  Value *Idx2 = std::next(GEP->idx_begin())->get();
   Value *X1;
   uint64_t MulConst, ShiftConst;
   // FIXME: 64-bit targets have `i64` type for the GEP index, so this match will
   // probably fail for other (e.g. 32-bit) targets.
-  if (!match(Idx2, m_ZExtOrSelf(
-                       m_LShr(m_Mul(m_c_And(m_Neg(m_Value(X1)), m_Deferred(X1)),
-                                    m_ConstantInt(MulConst)),
-                              m_ConstantInt(ShiftConst)))))
+  if (!match(GepIdx, m_ZExtOrSelf(m_LShr(
+                         m_Mul(m_c_And(m_Neg(m_Value(X1)), m_Deferred(X1)),
+                               m_ConstantInt(MulConst)),
+                         m_ConstantInt(ShiftConst)))))
     return false;
 
   unsigned InputBits = X1->getType()->getScalarSizeInBits();
diff --git a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
@@ -276,3 +276,50 @@ entry:
   %0 = load i32, ptr %arrayidx, align 4
   ret i32 %0
 }
+
+define i32 @ctz1_with_i8_gep(i32 %x) {
+; CHECK-LABEL: @ctz1_with_i8_gep(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP3]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %sub = sub i32 0, %x
+  %and = and i32 %sub, %x
+  %mul = mul i32 %and, 125613361
+  %shr = lshr i32 %mul, 27
+  %idxprom = zext i32 %shr to i64
+  %arrayidx = getelementptr inbounds i8, ptr @ctz7.table, i64 %idxprom
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  ret i32 %conv
+}
+
+define i32 @ctz2_with_i8_gep(i32 %x) {
+; CHECK-LABEL: @ctz2_with_i8_gep(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 0, [[X:%.*]]
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[SUB]], [[X]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[AND]], 72416175
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[MUL]], 26
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[SHR]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [64 x i8], ptr @ctz2.table, i64 0, i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %sub = sub i32 0, %x
+  %and = and i32 %sub, %x
+  %mul = mul i32 %and, 72416175
+  %shr = lshr i32 %mul, 26
+  %idxprom = zext i32 %shr to i64
+  %arrayidx = getelementptr inbounds [64 x i8], ptr @ctz2.table, i64 0, i64 %idxprom
+  %0 = load i16, ptr %arrayidx, align 2
+  %conv = sext i16 %0 to i32
+  ret i32 %conv
+}
diff --git a/llvm/test/Transforms/PhaseOrdering/lower-table-based-cttz.ll b/llvm/test/Transforms/PhaseOrdering/lower-table-based-cttz.ll
@@ -1,3 +1,6 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -O3 -S < %s | FileCheck %s
+
 ;; This tests lowering of the implementations of table-based ctz
 ;; algorithm to the llvm.cttz instruction in the -O3 case.
 
@@ -13,13 +16,17 @@
 ;; }
 ;; Compiled as: clang -O3 test.c -S -emit-llvm -Xclang -disable-llvm-optzns
 
-; RUN: opt -O3 -S < %s | FileCheck %s
-
-; CHECK: call range(i32 0, 33) i32 @llvm.cttz.i32
-
 @ctz1.table = internal constant [32 x i8] c"\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09", align 16
 
-define i32 @ctz1(i32 noundef %x) {
+define i32 @ctz(i32 noundef %x) {
+; CHECK-LABEL: define range(i32 0, 32) i32 @ctz(
+; CHECK-SAME: i32 noundef [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 [[X]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT:    [[CONV:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]]
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
 entry:
   %x.addr = alloca i32, align 4
   store i32 %x, ptr %x.addr, align 4
@@ -35,3 +42,28 @@ entry:
   %conv = sext i8 %2 to i32
   ret i32 %conv
 }
+
+define i32 @ctz_nonarraygep(i32 noundef %x) {
+; CHECK-LABEL: define range(i32 0, 32) i32 @ctz_nonarraygep(
+; CHECK-SAME: i32 noundef [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 [[X]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT:    [[CONV:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]]
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %x.addr = alloca i32, align 4
+  store i32 %x, ptr %x.addr, align 4
+  %0 = load i32, ptr %x.addr, align 4
+  %1 = load i32, ptr %x.addr, align 4
+  %sub = sub i32 0, %1
+  %and = and i32 %0, %sub
+  %mul = mul i32 %and, 125613361
+  %shr = lshr i32 %mul, 27
+  %idxprom = zext i32 %shr to i64
+  %arrayidx = getelementptr inbounds i8, ptr @ctz1.table, i64 %idxprom
+  %2 = load i8, ptr %arrayidx, align 1
+  %conv = sext i8 %2 to i32
+  ret i32 %conv
+}