Get i128 working too

davemgreen · davemgreen · commit cfa67d889f52 · 2025-07-31T10:04:57.000+01:00
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -458,14 +458,15 @@ static bool foldSqrt(CallInst *Call, LibFunc Func, TargetTransformInfo &TTI,
 // Check if this array of constants represents a cttz table.
 // Iterate over the elements from \p Table by trying to find/match all
 // the numbers from 0 to \p InputBits that should represent cttz results.
-static bool isCTTZTable(Constant *Table, uint64_t Mul, uint64_t Shift,
-                        uint64_t AndMask, Type *AccessTy, unsigned InputBits,
-                        APInt GEPIdxFactor, const DataLayout &DL) {
+static bool isCTTZTable(Constant *Table, const APInt &Mul, const APInt &Shift,
+                        const APInt &AndMask, Type *AccessTy,
+                        unsigned InputBits, const APInt &GEPIdxFactor,
+                        const DataLayout &DL) {
   for (unsigned Idx = 0; Idx < InputBits; Idx++) {
-    APInt Index = (APInt(InputBits, 1ull << Idx) * Mul).lshr(Shift) & AndMask;
+    APInt Index = (APInt(InputBits, 1).shl(Idx) * Mul).lshr(Shift) & AndMask;
     ConstantInt *C = dyn_cast_or_null<ConstantInt>(
         ConstantFoldLoadFromConst(Table, AccessTy, Index * GEPIdxFactor, DL));
-    if (!C || C->getZExtValue() != Idx)
+    if (!C || C->getValue() != Idx)
       return false;
   }
 
@@ -485,7 +486,7 @@ static bool isCTTZTable(Constant *Table, uint64_t Mul, uint64_t Shift,
 // There is also a special case when the element is 0.
 //
 // The (x & -x) sets the lowest non-zero bit to 1. The multiply is a de-bruijn
-// sequence that contains each patterns of bits in it. The shift extracts
+// sequence that contains each pattern of bits in it. The shift extracts
 // the top bits after the multiply, and that index into the table should
 // represent the number of trailing zeros in the original number.
 //
@@ -557,27 +558,26 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I, const DataLayout &DL) {
   auto [GepIdx, GEPScale] = VarOffsets.front();
 
   Value *X1;
-  uint64_t MulConst, ShiftConst, AndCst = ~0ull;
+  const APInt *MulConst, *ShiftConst, *AndCst = nullptr;
   // Check that the gep variable index is ((x & -x) * MulConst) >> ShiftConst.
   // This might be extended to the pointer index type, and if the gep index type
   // has been replaced with an i8 then a new And (and different ShiftConst) will
   // be present.
-  // FIXME: 64-bit targets have `i64` type for the GEP index, so this match will
-  // probably fail for other (e.g. 32-bit) targets.
-  auto MatchInner = m_LShr(m_Mul(m_c_And(m_Neg(m_Value(X1)), m_Deferred(X1)),
-                                 m_ConstantInt(MulConst)),
-                           m_ConstantInt(ShiftConst));
-  if (!match(GepIdx, m_ZExtOrSelf(MatchInner)) &&
-      !match(GepIdx, m_ZExtOrSelf(m_And(MatchInner, m_ConstantInt(AndCst)))))
+  auto MatchInner = m_LShr(
+      m_Mul(m_c_And(m_Neg(m_Value(X1)), m_Deferred(X1)), m_APInt(MulConst)),
+      m_APInt(ShiftConst));
+  if (!match(GepIdx, m_CastOrSelf(MatchInner)) &&
+      !match(GepIdx, m_CastOrSelf(m_And(MatchInner, m_APInt(AndCst)))))
     return false;
 
   unsigned InputBits = X1->getType()->getScalarSizeInBits();
-  if (InputBits != 16 && InputBits != 32 && InputBits != 64)
+  if (InputBits != 16 && InputBits != 32 && InputBits != 64 && InputBits != 128)
     return false;
 
   if (!GEPScale.isIntN(InputBits) ||
-      !isCTTZTable(GVTable->getInitializer(), MulConst, ShiftConst, AndCst,
-                   AccessType, InputBits, GEPScale.trunc(InputBits), DL))
+      !isCTTZTable(GVTable->getInitializer(), *MulConst, *ShiftConst,
+                   AndCst ? *AndCst : APInt::getAllOnes(InputBits), AccessType,
+                   InputBits, GEPScale.zextOrTrunc(InputBits), DL))
     return false;
 
   ConstantInt *ZeroTableElem = cast<ConstantInt>(
diff --git a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
@@ -452,3 +452,49 @@ entry:
   %conv6 = zext i8 %1 to i32
   ret i32 %conv6
 }
+
+; Same as ctz1 but the table and load is very large
+@ctz7i128.table = internal unnamed_addr constant [32 x i128] [i128 0, i128 1, i128 28, i128 2, i128 29, i128 14, i128 24, i128 3, i128 30, i128 22, i128 20, i128 15, i128 25, i128 17, i128 4, i128 8, i128 31, i128 27, i128 13, i128 23, i128 21, i128 19, i128 16, i128 7, i128 26, i128 12, i128 18, i128 6, i128 11, i128 5, i128 10, i128 9], align 16
+define i128 @ctz1_i128(i32 %x) {
+; CHECK-LABEL: @ctz1_i128(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i32 [[TMP2]] to i128
+; CHECK-NEXT:    ret i128 [[TMP3]]
+;
+entry:
+  %sub = sub i32 0, %x
+  %and = and i32 %sub, %x
+  %mul = mul i32 %and, 125613361
+  %shr = lshr i32 %mul, 27
+  %idxprom = zext i32 %shr to i64
+  %arrayidx = getelementptr inbounds [32 x i128], ptr @ctz7i128.table, i64 0, i64 %idxprom
+  %l = load i128, ptr %arrayidx, align 1
+  ret i128 %l
+}
+
+; This is roughly the same as ctz1 but using i128.
+@table.i128 = internal unnamed_addr constant [128 x i8] c"\00\01e\02tf<\03|ug^R=!\04}yvWoh_5ZSE>0\22\14\05~rzPwmX.pkiI`K6\1Ab[TBMF?'81*#\1C\15\0E\06\7Fds;{]Q xVn4YD/\13qOl-jHJ\19aAL&7)\1B\0Dc:\\\1FU3C\12N,G\18@%(\0C9\1E2\11+\17$\0B\1D\10\16\0A\0F\09\08\07", align 1
+define i32 @src(i128 noundef %x) {
+; CHECK-LABEL: @src(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP3:%.*]] = call i128 @llvm.cttz.i128(i128 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i128 [[X]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i128 0, i128 [[TMP3]]
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i128 [[TMP2]] to i8
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %sub = sub i128 0, %x
+  %and = and i128 %x, %sub
+  %mul = mul i128 %and, 2647824804797170443043024478319300753
+  %shr = lshr i128 %mul, 121
+  %idxprom = trunc i128 %shr to i64
+  %arrayidx = getelementptr inbounds nuw i8, ptr @table.i128, i64 %idxprom
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  ret i32 %conv
+}