[LV] Peek through bitcasts when performing CSE #146856

pedroclobo · 2025-07-03T11:04:12Z

LoopVectorize performs CSE of induction variable instructions. Add bitcasts to the worklist as well.

llvmbot · 2025-07-03T11:04:47Z

@llvm/pr-subscribers-vectorizers

@llvm/pr-subscribers-llvm-transforms

Author: Pedro Lobo (pedroclobo)

Changes

LoopVectorize performs CSE of induction variable instructions. Add bitcasts to the worklist as well.

Full diff: https://github.com/llvm/llvm-project/pull/146856.diff

2 Files Affected:

(modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+2-1)
(added) llvm/test/Transforms/LoopVectorize/bitcast-cse.ll (+71)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 67df7a8af098d..81f859b903435 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2643,7 +2643,8 @@ namespace {
 struct CSEDenseMapInfo {
   static bool canHandle(const Instruction *I) {
     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
-           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
+           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I) ||
+           isa<BitCastInst>(I);
   }
 
   static inline Instruction *getEmptyKey() {
diff --git a/llvm/test/Transforms/LoopVectorize/bitcast-cse.ll b/llvm/test/Transforms/LoopVectorize/bitcast-cse.ll
new file mode 100644
index 0000000000000..698a0d071acfe
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/bitcast-cse.ll
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt %s -passes=loop-vectorize -S | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @bitcast-cse(i16 %0) {
+; CHECK-LABEL: define i32 @bitcast-cse(
+; CHECK-SAME: i16 [[TMP0:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 16
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr null, i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[NEXT_GEP1:%.*]] = getelementptr i8, ptr null, i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[BROADCAST_SPLAT]] to <4 x half>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x half> [[TMP2]], <4 x half> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x half> [[TMP3]], <8 x half> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:    store <8 x half> [[INTERLEAVED_VEC]], ptr [[NEXT_GEP]], align 1
+; CHECK-NEXT:    store <8 x half> [[INTERLEAVED_VEC]], ptr [[NEXT_GEP1]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], -9223372036854775808
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br i1 true, label %[[FOR_END_LOOPEXIT909:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ null, %[[MIDDLE_BLOCK]] ], [ null, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i64 [ 0, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[DO_BODY93_I705:.*]]
+; CHECK:       [[DO_BODY93_I705]]:
+; CHECK-NEXT:    [[DEST_10_I706:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[ADD_PTR97_I709:%.*]], %[[DO_BODY93_I705]] ]
+; CHECK-NEXT:    [[LEN_ADDR_8_I707:%.*]] = phi i64 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[SUB99_I710:%.*]], %[[DO_BODY93_I705]] ]
+; CHECK-NEXT:    store i16 [[TMP0]], ptr [[DEST_10_I706]], align 1
+; CHECK-NEXT:    [[ARRAYIDX96_I708:%.*]] = getelementptr i16, ptr [[DEST_10_I706]], i64 1
+; CHECK-NEXT:    store half 0xH0000, ptr [[ARRAYIDX96_I708]], align 1
+; CHECK-NEXT:    [[ADD_PTR97_I709]] = getelementptr i16, ptr [[DEST_10_I706]], i64 2
+; CHECK-NEXT:    [[SUB99_I710]] = add i64 [[LEN_ADDR_8_I707]], -2
+; CHECK-NEXT:    [[TOBOOL100_NOT_I711:%.*]] = icmp eq i64 [[SUB99_I710]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL100_NOT_I711]], label %[[FOR_END_LOOPEXIT909]], label %[[DO_BODY93_I705]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[FOR_END_LOOPEXIT909]]:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  br label %do.body93.i705
+
+do.body93.i705:
+  %dest.10.i706 = phi ptr [ null, %entry ], [ %add.ptr97.i709, %do.body93.i705 ]
+  %len.addr.8.i707 = phi i64 [ 0, %entry ], [ %sub99.i710, %do.body93.i705 ]
+  store i16 %0, ptr %dest.10.i706, align 1
+  %arrayidx96.i708 = getelementptr i16, ptr %dest.10.i706, i64 1
+  store half 0.0, ptr %arrayidx96.i708, align 1
+  %add.ptr97.i709 = getelementptr i16, ptr %dest.10.i706, i64 2
+  %sub99.i710 = add i64 %len.addr.8.i707, -2
+  %tobool100.not.i711 = icmp eq i64 %sub99.i710, 0
+  br i1 %tobool100.not.i711, label %for.end.loopexit909, label %do.body93.i705
+
+for.end.loopexit909:
+  ret i32 0
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.

pedroclobo · 2025-07-18T15:36:39Z

This patch fixes a regression in the generated IR by the prototype introducing the byte type, which blocked the following shuffle + bitcast, in the byte type version, from getting CSE'd. The prototype was developed as part of this GSoC project.

; base version of clang
vector.ph926: 
  %n.vec928 = and i64 %53, -16
  %broadcast.splatinsert929 = insertelement <8 x i8> poison, i8 %50, i64 0   
  %broadcast.splatinsert931 = insertelement <8 x i8> poison, i8 %49, i64 0   
  %invariant.gep982 = getelementptr i8, ptr %dest.6.1660, i64 16
  %interleaved.vec937 = shufflevector <8 x i8> %broadcast.splatinsert931, <8 x i8> %broadcast.splatinsert929, ...
  br label %vector.body933

; clang with byte type
vector.ph942:
  %n.vec944 = and i64 %53, -16 
  %broadcast.splatinsert945 = insertelement <8 x b8> poison, b8 %50, i64 0
  %broadcast.splatinsert947 = insertelement <8 x i8> poison, i8 %49, i64 0
  %broadcast.splat948 = shufflevector <8 x i8> %broadcast.splatinsert947, <8 x i8> poison, <8 x i32> zeroinitializer 
  %invariant.gep998 = getelementptr i8, ptr %dest.6.1676, i64 16
  %54 bitcast <8 x i8> %broadcast.splat948 to <8 x b8>
  %interleaved.vec953 = shufflevector <8 x b8> %54, <8 x b8> %broadcast.splatinsert945, <16 x i32> <i32 0, i32 8, i32 1, i32 8, i32 2, i32 8, …>
  %55 = bitcast <8 x i8> %broadcast.splat948 to <8 x b8> 
  %interleaved.vec954 = shufflevector <8 x b8> %55, <8 x b8> %broadcast.splatinsert945, <16 x i32> <i32 8, i32 8, i32 1, i32 8, i32 2, i32 8, …>
  %next.gep935 = getelementptr i8, ptr %dest.6.1668, i64 %54
  br label %vector.body949

The following original reduced test case was adapted to not include the byte type.

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define fastcc i32 @Decompress_Sequences(i8 %0) {
entry:
  br label %do.body93.i705

do.body93.i705:                                   ; preds = %do.body93.i705, %entry
  %dest.10.i706 = phi ptr [ null, %entry ], [ %add.ptr97.i709, %do.body93.i705 ]
  %len.addr.8.i707 = phi i64 [ 0, %entry ], [ %sub99.i710, %do.body93.i705 ]
  store i8 %0, ptr %dest.10.i706, align 1
  %arrayidx96.i708 = getelementptr i8, ptr %dest.10.i706, i64 1
  store b8 0, ptr %arrayidx96.i708, align 1
  %add.ptr97.i709 = getelementptr i8, ptr %dest.10.i706, i64 2
  %sub99.i710 = add i64 %len.addr.8.i707, -2
  %tobool100.not.i711 = icmp eq i64 %sub99.i710, 0
  br i1 %tobool100.not.i711, label %for.end.loopexit909, label %do.body93.i705

for.end.loopexit909:                              ; preds = %do.body93.i705
  ret i32 0
}

The patch does not seem to have any significant impact on compile time, as reported by the compile time tracker.

artagnon

Kindly clean up the test with human-readable names.

llvm/test/Transforms/LoopVectorize/bitcast-cse.ll

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

llvm/test/Transforms/LoopVectorize/bitcast-cse.ll

llvm/test/Transforms/LoopVectorize/X86/bitcast-cse.ll

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

llvm/test/Transforms/LoopVectorize/X86/bitcast-cse.ll

Requires llvm#151487 to completely subsume the non-VPlan based limited CSE. Inspired by llvm#146856, although the test from that PR remains unchanged: still investigating.

artagnon · 2025-08-03T15:57:34Z

I had the chance to try to the VPlan-based CSE I was talking about,: #151872.

artagnon

LGTM, thanks! Might be good to wait a day or two, to give others also a chance to comment.

Requires llvm#151487 to completely subsume the non-VPlan based limited CSE. Inspired by llvm#146856, although the test from that PR remains unchanged: still investigating.

fhahn

Does this have any real-world impact with the byte type?

I'm curious because the case here looks very spefic: we need an interleave group, which stores an invariant value with the vector loop unrolled.

This isn't a case we can easily catch with VPlan CSE, because it is part of VPInterleaveRecipe::execute, and can't easily be broken out.

This will make it harder to replace the legacy CSE with VPlan-based CSE, and I don't think we should keep both around in parallel.

llvm/test/Transforms/LoopVectorize/X86/cse-bitcast.ll

LoopVectorize performs CSE of induction variable instructions. Adds bitcasts to the worklist as well.

pedroclobo · 2025-08-04T15:50:19Z

I'm not really sure about the consequences of the legacy CSE to VPlan-based CSE refactoring.
I think @artagnon is more knowledgeable on that matter.

The byte type only introduced an additional bitcast from an i8 to b8, which blocked the bitcast + shufflevector CSE. Before introducing the byte type, the store was over an integer type, so there was no need for a bitcast. As such, the CSE was able to eliminate the redundant shufflevector.

This pattern was found when diagnosing an assembly regression in 7zip.

artagnon · 2025-08-04T16:00:01Z

I'm not really sure about the consequences of the legacy CSE to VPlan-based CSE refactoring. I think @artagnon is more knowledgeable on that matter.

I think @fhahn is right, and it might be very difficult (or impossible) to get this bitcasting functionality present in the legacy cse in the VPlan-based one, although the same might be the case for several shuffles.

artagnon · 2025-08-04T21:32:15Z

This isn't a case we can easily catch with VPlan CSE, because it is part of VPInterleaveRecipe::execute, and can't easily be broken out.

I had a look at this, and I think the VPlan-based cse is going to be around with the legacy cse in the medium-term. Yes, breaking out VPInterleaveRecipe::execute seems hard, but we ultimately want to do this if we have to replace the legacy cse (VPInterleaveRecipe::execute also produces various shuffles): as @pedroclobo's patch has real-world impact and doesn't change the nature of the problem of deprecating the legacy cse, I think we should strive to get it merged, so we have the test case for the VPlan-based cse.

fhahn · 2025-08-05T10:38:26Z

I'm not really sure about the consequences of the legacy CSE to VPlan-based CSE refactoring. I think @artagnon is more knowledgeable on that matter.

The byte type only introduced an additional bitcast from an i8 to b8, which blocked the bitcast + shufflevector CSE. Before introducing the byte type, the store was over an integer type, so there was no need for a bitcast. As such, the CSE was able to eliminate the redundant shufflevector.

This pattern was found when diagnosing an assembly regression in 7zip.

What was the diff?

I'd expect the invariant shuffle + bit cast to be hoisted out. For the test case, it gets hoisted out of the vector loop for both AArch64 (https://llvm.godbolt.org/z/b9svxjzxM) and X86 (https://llvm.godbolt.org/z/v57zTa4ox)

pedroclobo · 2025-08-05T16:32:42Z

This pattern was found when diagnosing an assembly regression in 7zip.

Sorry, I meant IR, not assembly. The assembly regression associated with this particular test had another root cause.
Sorry for the mix-up.

Despite generating identical assembly, I added the bitcast to the CSE worklist in order to avoid generated IR differences between upstream Clang and the prototype introducing the byte type.

I opened this PR as I thought it may be beneficial to upstream this change, although this invariant shufflevector + bitcast pattern might be somewhat rare (among the servers I could test this on, it only showed up on one with an older Intel CPU).

llvmbot added vectorizers llvm:transforms labels Jul 3, 2025

pedroclobo requested a review from fhahn July 18, 2025 15:38

pedroclobo requested a review from artagnon August 2, 2025 11:41

artagnon reviewed Aug 2, 2025

View reviewed changes

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp Outdated Show resolved Hide resolved

pedroclobo requested a review from artagnon August 2, 2025 22:15

artagnon reviewed Aug 3, 2025

View reviewed changes

llvm/test/Transforms/LoopVectorize/bitcast-cse.ll Outdated Show resolved Hide resolved

artagnon requested a review from lukel97 August 3, 2025 08:42

artagnon changed the title ~~[LoopVectorize] Peek through bitcasts when performing CSE~~ [LV] Peek through bitcasts when performing CSE Aug 3, 2025

artagnon reviewed Aug 3, 2025

View reviewed changes

llvm/test/Transforms/LoopVectorize/X86/bitcast-cse.ll Outdated Show resolved Hide resolved

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp Show resolved Hide resolved

artagnon reviewed Aug 3, 2025

View reviewed changes

llvm/test/Transforms/LoopVectorize/X86/bitcast-cse.ll Outdated Show resolved Hide resolved

artagnon reviewed Aug 3, 2025

View reviewed changes

llvm/test/Transforms/LoopVectorize/X86/bitcast-cse.ll Outdated Show resolved Hide resolved

artagnon mentioned this pull request Aug 3, 2025

[VPlan] Introduce CSE pass #151872

Open

pedroclobo force-pushed the bitcast-cse branch from 30df849 to e93bc65 Compare August 3, 2025 17:12

artagnon approved these changes Aug 3, 2025

View reviewed changes

fhahn reviewed Aug 4, 2025

View reviewed changes

llvm/test/Transforms/LoopVectorize/X86/cse-bitcast.ll Outdated Show resolved Hide resolved

pedroclobo added 2 commits August 4, 2025 16:30

[LV] Pre-commit test for cse_bitcast

53c4a23

[LV] Peek through bitcasts when performing CSE

a94c43f

LoopVectorize performs CSE of induction variable instructions. Adds bitcasts to the worklist as well.

pedroclobo force-pushed the bitcast-cse branch from e93bc65 to a94c43f Compare August 4, 2025 15:31

[LV] Peek through bitcasts when performing CSE #146856

Are you sure you want to change the base?

[LV] Peek through bitcasts when performing CSE #146856

Uh oh!

Conversation

pedroclobo commented Jul 3, 2025

Uh oh!

llvmbot commented Jul 3, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

pedroclobo commented Jul 18, 2025

Uh oh!

artagnon left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

artagnon commented Aug 3, 2025

Uh oh!

artagnon left a comment

Choose a reason for hiding this comment

Uh oh!

fhahn left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

pedroclobo commented Aug 4, 2025

Uh oh!

artagnon commented Aug 4, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

artagnon commented Aug 4, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

fhahn commented Aug 5, 2025

Uh oh!

pedroclobo commented Aug 5, 2025

Uh oh!

Uh oh!

llvmbot commented Jul 3, 2025 •

edited

Loading

artagnon commented Aug 4, 2025 •

edited

Loading

artagnon commented Aug 4, 2025 •

edited

Loading