Use ConstantFoldLoadFromConst, and get i16 tables working as a result

davemgreen · davemgreen · commit bb524c5bdfe9 · 2025-07-29T14:04:44.000+01:00
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -457,30 +457,20 @@ static bool foldSqrt(CallInst *Call, LibFunc Func, TargetTransformInfo &TTI,
 
 // Check if this array of constants represents a cttz table.
 // Iterate over the elements from \p Table by trying to find/match all
-// the numbers from 0 to \p InputBits that should represent cttz results.
-static bool isCTTZTable(const ConstantDataArray &Table, uint64_t Mul,
-                        uint64_t Shift, uint64_t InputBits) {
-  unsigned Length = Table.getNumElements();
-  if (Length < InputBits || Length > InputBits * 2)
-    return false;
-
-  APInt Mask = APInt::getBitsSetFrom(InputBits, Shift);
-  unsigned Matched = 0;
-
-  for (unsigned i = 0; i < Length; i++) {
-    uint64_t Element = Table.getElementAsInteger(i);
-    if (Element >= InputBits)
-      continue;
-
-    // Check if \p Element matches a concrete answer. It could fail for some
-    // elements that are never accessed, so we keep iterating over each element
-    // from the table. The number of matched elements should be equal to the
-    // number of potential right answers which is \p InputBits actually.
-    if ((((Mul << Element) & Mask.getZExtValue()) >> Shift) == i)
-      Matched++;
+// the numbers from 0 to \p InputTy->getSizeInBits() that should represent cttz
+// results.
+static bool isCTTZTable(Constant *Table, uint64_t Mul, uint64_t Shift,
+                        Type *AccessTy, unsigned InputBits,
+                        unsigned GEPIdxFactor, const DataLayout &DL) {
+  for (unsigned Idx = 0; Idx < InputBits; Idx++) {
+    APInt Index = (APInt(InputBits, 1ull << Idx) * Mul).lshr(Shift);
+    ConstantInt *C = dyn_cast_or_null<ConstantInt>(
+        ConstantFoldLoadFromConst(Table, AccessTy, Index * GEPIdxFactor, DL));
+    if (!C || C->getZExtValue() != Idx)
+      return false;
   }
 
-  return Matched == InputBits;
+  return true;
 }
 
 // Try to recognize table-based ctz implementation.
@@ -537,7 +527,7 @@ static bool isCTTZTable(const ConstantDataArray &Table, uint64_t Mul,
 // %0 = load i8, i8* %arrayidx, align 1, !tbaa !8
 //
 // All this can be lowered to @llvm.cttz.i32/64 intrinsic.
-static bool tryToRecognizeTableBasedCttz(Instruction &I) {
+static bool tryToRecognizeTableBasedCttz(Instruction &I, const DataLayout &DL) {
   LoadInst *LI = dyn_cast<LoadInst>(&I);
   if (!LI)
     return false;
@@ -567,11 +557,6 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I) {
   if (!GVTable || !GVTable->hasInitializer() || !GVTable->isConstant())
     return false;
 
-  ConstantDataArray *ConstData =
-      dyn_cast<ConstantDataArray>(GVTable->getInitializer());
-  if (!ConstData || ConstData->getElementType() != GEPSrcEltTy)
-    return false;
-
   Value *X1;
   uint64_t MulConst, ShiftConst;
   // FIXME: 64-bit targets have `i64` type for the GEP index, so this match will
@@ -583,19 +568,21 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I) {
     return false;
 
   unsigned InputBits = X1->getType()->getScalarSizeInBits();
-  if (InputBits != 32 && InputBits != 64)
+  if (InputBits != 16 && InputBits != 32 && InputBits != 64)
     return false;
 
-  // Shift should extract top 5..7 bits.
+  // Shift should extract top 4..7 bits.
   if (InputBits - Log2_32(InputBits) != ShiftConst &&
       InputBits - Log2_32(InputBits) - 1 != ShiftConst)
     return false;
 
-  if (!isCTTZTable(*ConstData, MulConst, ShiftConst, InputBits))
+  if (!isCTTZTable(GVTable->getInitializer(), MulConst, ShiftConst, AccessType,
+                   InputBits, GEPSrcEltTy->getScalarSizeInBits() / 8, DL))
     return false;
 
-  auto ZeroTableElem = ConstData->getElementAsInteger(0);
-  bool DefinedForZero = ZeroTableElem == InputBits;
+  ConstantInt *ZeroTableElem = cast<ConstantInt>(
+      ConstantFoldLoadFromConst(GVTable->getInitializer(), AccessType, DL));
+  bool DefinedForZero = ZeroTableElem->getZExtValue() == InputBits;
 
   IRBuilder<> B(LI);
   ConstantInt *BoolConst = B.getInt1(!DefinedForZero);
@@ -609,8 +596,7 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I) {
     // If the value in elem 0 isn't the same as InputBits, we still want to
     // produce the value from the table.
     auto Cmp = B.CreateICmpEQ(X1, ConstantInt::get(XType, 0));
-    auto Select =
-        B.CreateSelect(Cmp, ConstantInt::get(XType, ZeroTableElem), Cttz);
+    auto Select = B.CreateSelect(Cmp, B.CreateZExt(ZeroTableElem, XType), Cttz);
 
     // NOTE: If the table[0] is 0, but the cttz(0) is defined by the Target
     // it should be handled as: `cttz(x) & (typeSize - 1)`.
@@ -1479,7 +1465,7 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
       MadeChange |= foldGuardedFunnelShift(I, DT);
       MadeChange |= tryToRecognizePopCount(I);
       MadeChange |= tryToFPToSat(I, TTI);
-      MadeChange |= tryToRecognizeTableBasedCttz(I);
+      MadeChange |= tryToRecognizeTableBasedCttz(I, DL);
       MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT);
       MadeChange |= foldPatternedLoads(I, DL);
       MadeChange |= foldICmpOrChain(I, DL, TTI, AA, DT);
diff --git a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
@@ -299,6 +299,7 @@ entry:
   ret i32 %conv
 }
 
+; This is the same a ctz2 (i16 table) with an i8 gep making the indices invalid
 define i32 @ctz2_with_i8_gep(i32 %x) {
 ; CHECK-LABEL: @ctz2_with_i8_gep(
 ; CHECK-NEXT:  entry:
@@ -308,7 +309,7 @@ define i32 @ctz2_with_i8_gep(i32 %x) {
 ; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[MUL]], 26
 ; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[SHR]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [64 x i8], ptr @ctz2.table, i64 0, i64 [[IDXPROM]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
 ; CHECK-NEXT:    ret i32 [[CONV]]
 ;
@@ -319,7 +320,84 @@ entry:
   %shr = lshr i32 %mul, 26
   %idxprom = zext i32 %shr to i64
   %arrayidx = getelementptr inbounds [64 x i8], ptr @ctz2.table, i64 0, i64 %idxprom
-  %0 = load i16, ptr %arrayidx, align 2
+  %0 = load i16, ptr %arrayidx, align 1
   %conv = sext i16 %0 to i32
   ret i32 %conv
 }
+
+; This is the same a ctz2_with_i8_gep but with the gep index multiplied by 2.
+define i32 @ctz2_with_i8_gep_fixed(i32 %x) {
+; CHECK-LABEL: @ctz2_with_i8_gep_fixed(
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 0, [[X:%.*]]
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], [[SUB]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[AND]], 72416175
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[MUL]], 25
+; CHECK-NEXT:    [[SHR2:%.*]] = and i32 [[SHR]], 126
+; CHECK-NEXT:    [[TMP1:%.*]] = zext nneg i32 [[SHR2]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr @ctz2.table, i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP2]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+  %sub = sub i32 0, %x
+  %and = and i32 %x, %sub
+  %mul = mul i32 %and, 72416175
+  %shr = lshr i32 %mul, 25
+  %shr2 = and i32 %shr, 126
+  %1 = zext nneg i32 %shr2 to i64
+  %arrayidx = getelementptr inbounds nuw i8, ptr @ctz2.table, i64 %1
+  %2 = load i16, ptr %arrayidx, align 2
+  %conv = sext i16 %2 to i32
+  ret i32 %conv
+}
+
+; This is a i16 input with the debruijn table stored in a single i128.
+@tablei128 = internal unnamed_addr constant i128 16018378897745984667142067713738932480, align 16
+define i32 @cttz_i16_via_i128(i16 noundef %x) {
+; CHECK-LABEL: @cttz_i16_via_i128(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i16 @llvm.cttz.i16(i16 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i16 [[X]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP3]], i16 0, i16 [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i16 [[TMP2]] to i8
+; CHECK-NEXT:    [[CONV6:%.*]] = zext i8 [[TMP1]] to i32
+; CHECK-NEXT:    ret i32 [[CONV6]]
+;
+entry:
+  %sub = sub i16 0, %x
+  %and = and i16 %x, %sub
+  %mul = mul i16 %and, 2479
+  %0 = lshr i16 %mul, 12
+  %idxprom = zext nneg i16 %0 to i64
+  %arrayidx = getelementptr inbounds nuw i8, ptr @tablei128, i64 %idxprom
+  %1 = load i8, ptr %arrayidx, align 1
+  %conv6 = zext i8 %1 to i32
+  ret i32 %conv6
+}
+
+; Same as above but the table is a little off
+@tablei128b = internal unnamed_addr constant i128 16018378897745984667142068813250560256, align 16
+define i32 @cttz_i16_via_i128_incorrecttable(i16 noundef %x) {
+; CHECK-LABEL: @cttz_i16_via_i128_incorrecttable(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SUB:%.*]] = sub i16 0, [[X:%.*]]
+; CHECK-NEXT:    [[AND:%.*]] = and i16 [[X]], [[SUB]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i16 [[AND]], 2479
+; CHECK-NEXT:    [[TMP0:%.*]] = lshr i16 [[MUL]], 12
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext nneg i16 [[TMP0]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr @tablei128b, i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CONV6:%.*]] = zext i8 [[TMP3]] to i32
+; CHECK-NEXT:    ret i32 [[CONV6]]
+;
+entry:
+  %sub = sub i16 0, %x
+  %and = and i16 %x, %sub
+  %mul = mul i16 %and, 2479
+  %0 = lshr i16 %mul, 12
+  %idxprom = zext nneg i16 %0 to i64
+  %arrayidx = getelementptr inbounds nuw i8, ptr @tablei128b, i64 %idxprom
+  %1 = load i8, ptr %arrayidx, align 1
+  %conv6 = zext i8 %1 to i32
+  ret i32 %conv6
+}