From fe1ef332c23d8ce858e41104352d68b786342cf7 Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Tue, 15 Apr 2025 07:45:46 +0100 Subject: [PATCH 01/14] [X86][AVX] Match v4f64 blend from shuffle of scalar values. Convert a BUILD_VECTOR of scalar values to a shuffle of shuffles that will lower to AVX blend. --- llvm/test/CodeGen/X86/shuffle-blendw.ll | 422 ++++++++++++++++++++++++ 1 file changed, 422 insertions(+) diff --git a/llvm/test/CodeGen/X86/shuffle-blendw.ll b/llvm/test/CodeGen/X86/shuffle-blendw.ll index 9f90657dc64d1..28af382ec3e07 100644 --- a/llvm/test/CodeGen/X86/shuffle-blendw.ll +++ b/llvm/test/CodeGen/X86/shuffle-blendw.ll @@ -263,3 +263,425 @@ define <8 x i16> @blendw_to_blendd_fail_16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %shuffle = shufflevector <8 x i16> %x1, <8 x i16> %y, <8 x i32> ret <8 x i16> %shuffle } + +define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) { +; X86-SSE41-LABEL: blend_broadcasts_v4f64: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE41-NEXT: movaps (%ecx), %xmm2 +; X86-SSE41-NEXT: movaps (%eax), %xmm1 +; X86-SSE41-NEXT: movaps %xmm2, %xmm0 +; X86-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X86-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X86-SSE41-NEXT: retl +; +; X64-SSE41-LABEL: blend_broadcasts_v4f64: +; X64-SSE41: # %bb.0: +; X64-SSE41-NEXT: movaps (%rdi), %xmm2 +; X64-SSE41-NEXT: movaps (%rsi), %xmm1 +; X64-SSE41-NEXT: movaps %xmm2, %xmm0 +; X64-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X64-SSE41-NEXT: retq +; +; X86-AVX-LABEL: blend_broadcasts_v4f64: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: vbroadcastsd (%ecx), %ymm0 +; X86-AVX-NEXT: vbroadcastsd (%eax), %ymm1 +; X86-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] +; X86-AVX-NEXT: retl +; +; X64-AVX-LABEL: blend_broadcasts_v4f64: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vbroadcastsd (%rdi), %ymm0 +; X64-AVX-NEXT: vbroadcastsd (%rsi), %ymm1 +; X64-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] +; X64-AVX-NEXT: retq +; +; X86-AVX2-LABEL: blend_broadcasts_v4f64: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: vbroadcastsd (%ecx), %ymm0 +; X86-AVX2-NEXT: vbroadcastsd (%eax), %ymm1 +; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: blend_broadcasts_v4f64: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vbroadcastsd (%rdi), %ymm0 +; X64-AVX2-NEXT: vbroadcastsd (%rsi), %ymm1 +; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] +; X64-AVX2-NEXT: retq +; +; X86-AVX512-LABEL: blend_broadcasts_v4f64: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512-NEXT: vbroadcastsd (%ecx), %ymm0 +; X86-AVX512-NEXT: vbroadcastsd (%eax), %ymm1 +; X86-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] +; X86-AVX512-NEXT: retl +; +; X64-AVX512-LABEL: blend_broadcasts_v4f64: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vbroadcastsd (%rdi), %ymm0 +; X64-AVX512-NEXT: vbroadcastsd (%rsi), %ymm1 +; X64-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] +; X64-AVX512-NEXT: retq + %ld0 = load <4 x double>, ptr %p0, align 32 + %ld1 = load <4 
x double>, ptr %p1, align 32 + %bcst0 = shufflevector <4 x double> %ld0, <4 x double> undef, <4 x i32> zeroinitializer + %bcst1 = shufflevector <4 x double> %ld1, <4 x double> undef, <4 x i32> zeroinitializer + %blend = shufflevector <4 x double> %bcst0, <4 x double> %bcst1, <4 x i32> + ret <4 x double> %blend +} + +define <4 x double> @blend_broadcasts_v2f64(ptr %p0, ptr %p1) { +; X86-SSE41-LABEL: blend_broadcasts_v2f64: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE41-NEXT: movaps (%ecx), %xmm2 +; X86-SSE41-NEXT: movaps (%eax), %xmm1 +; X86-SSE41-NEXT: movaps %xmm2, %xmm0 +; X86-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X86-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X86-SSE41-NEXT: retl +; +; X64-SSE41-LABEL: blend_broadcasts_v2f64: +; X64-SSE41: # %bb.0: +; X64-SSE41-NEXT: movaps (%rdi), %xmm2 +; X64-SSE41-NEXT: movaps (%rsi), %xmm1 +; X64-SSE41-NEXT: movaps %xmm2, %xmm0 +; X64-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X64-SSE41-NEXT: retq +; +; X86-AVX-LABEL: blend_broadcasts_v2f64: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; X86-AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; X86-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; X86-AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; X86-AVX-NEXT: retl +; +; X64-AVX-LABEL: blend_broadcasts_v2f64: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; X64-AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; X64-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; X64-AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; X64-AVX-NEXT: retq +; +; X86-AVX2-LABEL: blend_broadcasts_v2f64: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; X86-AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; X86-AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: blend_broadcasts_v2f64: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; X64-AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; X64-AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; X64-AVX2-NEXT: retq +; +; X86-AVX512-LABEL: blend_broadcasts_v2f64: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; X86-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1] +; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,4,6,2] +; X86-AVX512-NEXT: vpermi2pd %ymm1, %ymm2, %ymm0 +; X86-AVX512-NEXT: retl +; +; X64-AVX512-LABEL: blend_broadcasts_v2f64: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: 
vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; X64-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1] +; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,4,6,2] +; X64-AVX512-NEXT: vpermi2pd %ymm1, %ymm2, %ymm0 +; X64-AVX512-NEXT: retq + %ld0 = load <2 x double>, ptr %p0, align 32 + %ld1 = load <2 x double>, ptr %p1, align 32 + %blend = shufflevector <2 x double> %ld0, <2 x double> %ld1, <4 x i32> + ret <4 x double> %blend +} + +define <4 x double> @blend_broadcasts_v1f64(ptr %p0, ptr %p1) { +; X86-SSE41-LABEL: blend_broadcasts_v1f64: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X86-SSE41-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; X86-SSE41-NEXT: movaps %xmm2, %xmm0 +; X86-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X86-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X86-SSE41-NEXT: retl +; +; X64-SSE41-LABEL: blend_broadcasts_v1f64: +; X64-SSE41: # %bb.0: +; X64-SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X64-SSE41-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; X64-SSE41-NEXT: movaps %xmm2, %xmm0 +; X64-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X64-SSE41-NEXT: retq +; +; X86-AVX-LABEL: blend_broadcasts_v1f64: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X86-AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX-NEXT: retl +; +; X64-AVX-LABEL: blend_broadcasts_v1f64: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X64-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X64-AVX-NEXT: retq +; +; X86-AVX2-LABEL: blend_broadcasts_v1f64: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X86-AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: blend_broadcasts_v1f64: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X64-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: retq +; +; X86-AVX512-LABEL: blend_broadcasts_v1f64: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X86-AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; X86-AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X86-AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX512-NEXT: retl +; +; X64-AVX512-LABEL: blend_broadcasts_v1f64: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 
+; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X64-AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; X64-AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X64-AVX512-NEXT: retq + %ld0 = load <1 x double>, ptr %p0, align 32 + %ld1 = load <1 x double>, ptr %p1, align 32 + %blend = shufflevector <1 x double> %ld0, <1 x double> %ld1, <4 x i32> + ret <4 x double> %blend +} + +define <4 x double> @blend_broadcasts_v1f64_4x(ptr %p0, ptr %p1) { +; X86-SSE41-LABEL: blend_broadcasts_v1f64_4x: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X86-SSE41-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; X86-SSE41-NEXT: movaps %xmm2, %xmm0 +; X86-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X86-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X86-SSE41-NEXT: retl +; +; X64-SSE41-LABEL: blend_broadcasts_v1f64_4x: +; X64-SSE41: # %bb.0: +; X64-SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X64-SSE41-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; X64-SSE41-NEXT: movaps %xmm2, %xmm0 +; X64-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X64-SSE41-NEXT: retq +; +; X86-AVX-LABEL: blend_broadcasts_v1f64_4x: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X86-AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX-NEXT: retl +; +; X64-AVX-LABEL: blend_broadcasts_v1f64_4x: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X64-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X64-AVX-NEXT: retq +; +; X86-AVX2-LABEL: blend_broadcasts_v1f64_4x: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X86-AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: blend_broadcasts_v1f64_4x: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X64-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: retq +; +; X86-AVX512-LABEL: blend_broadcasts_v1f64_4x: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X86-AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; X86-AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X86-AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX512-NEXT: retl +; +; X64-AVX512-LABEL: blend_broadcasts_v1f64_4x: +; X64-AVX512: # %bb.0: +; 
X64-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X64-AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; X64-AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X64-AVX512-NEXT: retq + %ld0 = load <1 x double>, ptr %p0, align 32 + %ld1 = load <1 x double>, ptr %p1, align 32 + %bcst0 = shufflevector <1 x double> %ld0, <1 x double> undef, <4 x i32> zeroinitializer + %bcst1 = shufflevector <1 x double> %ld1, <1 x double> undef, <4 x i32> zeroinitializer + %blend = shufflevector <4 x double> %bcst0, <4 x double> %bcst1, <4 x i32> + ret <4 x double> %blend +} + +define <4 x double> @blend_broadcasts_v1f64_2x(ptr %p0, ptr %p1) { +; X86-SSE41-LABEL: blend_broadcasts_v1f64_2x: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X86-SSE41-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; X86-SSE41-NEXT: movaps %xmm2, %xmm0 +; X86-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X86-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X86-SSE41-NEXT: retl +; +; X64-SSE41-LABEL: blend_broadcasts_v1f64_2x: +; X64-SSE41: # %bb.0: +; X64-SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X64-SSE41-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; X64-SSE41-NEXT: movaps %xmm2, %xmm0 +; X64-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X64-SSE41-NEXT: retq +; +; X86-AVX-LABEL: blend_broadcasts_v1f64_2x: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X86-AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX-NEXT: retl +; +; X64-AVX-LABEL: blend_broadcasts_v1f64_2x: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X64-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X64-AVX-NEXT: retq +; +; X86-AVX2-LABEL: blend_broadcasts_v1f64_2x: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X86-AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: blend_broadcasts_v1f64_2x: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X64-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: retq +; +; X86-AVX512-LABEL: blend_broadcasts_v1f64_2x: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X86-AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = 
xmm1[0],xmm0[0] +; X86-AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X86-AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX512-NEXT: retl +; +; X64-AVX512-LABEL: blend_broadcasts_v1f64_2x: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X64-AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; X64-AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X64-AVX512-NEXT: retq + %ld0 = load <1 x double>, ptr %p0, align 32 + %ld1 = load <1 x double>, ptr %p1, align 32 + %bcst0 = shufflevector <1 x double> %ld0, <1 x double> undef, <2 x i32> zeroinitializer + %bcst1 = shufflevector <1 x double> %ld1, <1 x double> undef, <2 x i32> zeroinitializer + %blend = shufflevector <2 x double> %bcst0, <2 x double> %bcst1, <4 x i32> + ret <4 x double> %blend +} From 20e90d04e1ae2ad604f769f6f95b5623ebe046b8 Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Tue, 15 Apr 2025 07:59:42 +0100 Subject: [PATCH 02/14] Add lowering code and update tests. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 33 ++++++ llvm/test/CodeGen/X86/shuffle-blendw.ll | 144 +++++++++--------------- 2 files changed, 87 insertions(+), 90 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index cd1bbb8fbb7b7..f673266fb89db 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -9133,6 +9133,39 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { MVT OpEltVT = Op.getOperand(0).getSimpleValueType(); unsigned NumElems = Op.getNumOperands(); + // Match BUILD_VECTOR of scalars that we can lower to X86ISD::BLENDI via + // shuffles. + // + // v4f64 = BUILD_VECTOR X,Y,Y,X + // >>> + // t1: v4f64 = BUILD_VECTOR X,u,u,u + // t3: v4f64 = vector_shuffle<0,u,u,0> t1, u + // t2: v4f64 = BUILD_VECTOR Y,u,u,u + // t4: v4f64 = vector_shuffle t2, u + // v4f64 = vector_shuffle<0,5,6,3> t3, t4 + // + if (Subtarget.hasAVX() && VT == MVT::v4f64 && Op->getNumOperands() == 4u) { + auto Op0 = Op->getOperand(0u); + auto Op1 = Op->getOperand(1u); + auto Op2 = Op->getOperand(2u); + auto Op3 = Op->getOperand(3u); + + // Match X,Y,Y,X inputs. + if (Op0 == Op3 && Op1 == Op2 && Op0 != Op1) { + auto PsnVal = DAG.getUNDEF(MVT::f64); + + auto NewOp0 = DAG.getBuildVector(VT, dl, {Op0, PsnVal, PsnVal, PsnVal}); + NewOp0 = DAG.getVectorShuffle(VT, dl, NewOp0, DAG.getUNDEF(VT), + {0, -1, -1, 0}); + + auto NewOp1 = DAG.getBuildVector(VT, dl, {Op1, PsnVal, PsnVal, PsnVal}); + NewOp1 = DAG.getVectorShuffle(VT, dl, NewOp1, DAG.getUNDEF(VT), + {-1, 0, 0, -1}); + + return DAG.getVectorShuffle(VT, dl, NewOp0, NewOp1, {0, 5, 6, 3}); + } + } + // Generate vectors for predicate vectors. 
if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget); diff --git a/llvm/test/CodeGen/X86/shuffle-blendw.ll b/llvm/test/CodeGen/X86/shuffle-blendw.ll index 28af382ec3e07..a1af29550f64f 100644 --- a/llvm/test/CodeGen/X86/shuffle-blendw.ll +++ b/llvm/test/CodeGen/X86/shuffle-blendw.ll @@ -449,60 +449,48 @@ define <4 x double> @blend_broadcasts_v1f64(ptr %p0, ptr %p1) { ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X86-AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX-NEXT: vbroadcastsd (%ecx), %ymm0 +; X86-AVX-NEXT: vbroadcastsd (%eax), %ymm1 +; X86-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; X86-AVX-NEXT: retl ; ; X64-AVX-LABEL: blend_broadcasts_v1f64: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X64-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X64-AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X64-AVX-NEXT: vbroadcastsd (%rsi), %ymm0 +; X64-AVX-NEXT: vbroadcastsd (%rdi), %ymm1 +; X64-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; X64-AVX-NEXT: retq ; ; X86-AVX2-LABEL: blend_broadcasts_v1f64: ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X86-AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vbroadcastsd (%ecx), %ymm0 +; X86-AVX2-NEXT: vbroadcastsd (%eax), %ymm1 +; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; X86-AVX2-NEXT: retl ; ; X64-AVX2-LABEL: blend_broadcasts_v1f64: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X64-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X64-AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vbroadcastsd (%rsi), %ymm0 +; X64-AVX2-NEXT: vbroadcastsd (%rdi), %ymm1 +; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; X64-AVX2-NEXT: retq ; ; X86-AVX512-LABEL: blend_broadcasts_v1f64: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X86-AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; X86-AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X86-AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX512-NEXT: vbroadcastsd (%ecx), %ymm0 +; X86-AVX512-NEXT: vbroadcastsd (%eax), %ymm1 +; X86-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; X86-AVX512-NEXT: retl ; ; X64-AVX512-LABEL: blend_broadcasts_v1f64: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X64-AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = 
xmm1[0],xmm0[0] -; X64-AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X64-AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X64-AVX512-NEXT: vbroadcastsd (%rsi), %ymm0 +; X64-AVX512-NEXT: vbroadcastsd (%rdi), %ymm1 +; X64-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; X64-AVX512-NEXT: retq %ld0 = load <1 x double>, ptr %p0, align 32 %ld1 = load <1 x double>, ptr %p1, align 32 @@ -535,60 +523,48 @@ define <4 x double> @blend_broadcasts_v1f64_4x(ptr %p0, ptr %p1) { ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X86-AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX-NEXT: vbroadcastsd (%ecx), %ymm0 +; X86-AVX-NEXT: vbroadcastsd (%eax), %ymm1 +; X86-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; X86-AVX-NEXT: retl ; ; X64-AVX-LABEL: blend_broadcasts_v1f64_4x: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X64-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X64-AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X64-AVX-NEXT: vbroadcastsd (%rsi), %ymm0 +; X64-AVX-NEXT: vbroadcastsd (%rdi), %ymm1 +; X64-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; X64-AVX-NEXT: retq ; ; X86-AVX2-LABEL: blend_broadcasts_v1f64_4x: ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X86-AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vbroadcastsd (%ecx), %ymm0 +; X86-AVX2-NEXT: vbroadcastsd (%eax), %ymm1 +; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; X86-AVX2-NEXT: retl ; ; X64-AVX2-LABEL: blend_broadcasts_v1f64_4x: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X64-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X64-AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vbroadcastsd (%rsi), %ymm0 +; X64-AVX2-NEXT: vbroadcastsd (%rdi), %ymm1 +; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; X64-AVX2-NEXT: retq ; ; X86-AVX512-LABEL: blend_broadcasts_v1f64_4x: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X86-AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; X86-AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X86-AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX512-NEXT: vbroadcastsd (%ecx), %ymm0 +; X86-AVX512-NEXT: vbroadcastsd (%eax), %ymm1 +; X86-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; X86-AVX512-NEXT: retl ; ; X64-AVX512-LABEL: blend_broadcasts_v1f64_4x: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X64-AVX512-NEXT: 
vmovsd {{.*#+}} xmm1 = mem[0],zero -; X64-AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; X64-AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X64-AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X64-AVX512-NEXT: vbroadcastsd (%rsi), %ymm0 +; X64-AVX512-NEXT: vbroadcastsd (%rdi), %ymm1 +; X64-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; X64-AVX512-NEXT: retq %ld0 = load <1 x double>, ptr %p0, align 32 %ld1 = load <1 x double>, ptr %p1, align 32 @@ -623,60 +599,48 @@ define <4 x double> @blend_broadcasts_v1f64_2x(ptr %p0, ptr %p1) { ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X86-AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX-NEXT: vbroadcastsd (%ecx), %ymm0 +; X86-AVX-NEXT: vbroadcastsd (%eax), %ymm1 +; X86-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; X86-AVX-NEXT: retl ; ; X64-AVX-LABEL: blend_broadcasts_v1f64_2x: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X64-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X64-AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X64-AVX-NEXT: vbroadcastsd (%rsi), %ymm0 +; X64-AVX-NEXT: vbroadcastsd (%rdi), %ymm1 +; X64-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; X64-AVX-NEXT: retq ; ; X86-AVX2-LABEL: blend_broadcasts_v1f64_2x: ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X86-AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vbroadcastsd (%ecx), %ymm0 +; X86-AVX2-NEXT: vbroadcastsd (%eax), %ymm1 +; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; X86-AVX2-NEXT: retl ; ; X64-AVX2-LABEL: blend_broadcasts_v1f64_2x: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X64-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X64-AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vbroadcastsd (%rsi), %ymm0 +; X64-AVX2-NEXT: vbroadcastsd (%rdi), %ymm1 +; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; X64-AVX2-NEXT: retq ; ; X86-AVX512-LABEL: blend_broadcasts_v1f64_2x: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X86-AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; X86-AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X86-AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X86-AVX512-NEXT: vbroadcastsd (%ecx), %ymm0 +; X86-AVX512-NEXT: vbroadcastsd (%eax), %ymm1 +; X86-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; X86-AVX512-NEXT: retl ; ; X64-AVX512-LABEL: blend_broadcasts_v1f64_2x: ; X64-AVX512: # 
%bb.0: -; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X64-AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; X64-AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X64-AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X64-AVX512-NEXT: vbroadcastsd (%rsi), %ymm0 +; X64-AVX512-NEXT: vbroadcastsd (%rdi), %ymm1 +; X64-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; X64-AVX512-NEXT: retq %ld0 = load <1 x double>, ptr %p0, align 32 %ld1 = load <1 x double>, ptr %p1, align 32 From 3d9918caab531d5e636ca5e9e3389353850e2f7c Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Tue, 15 Apr 2025 08:25:03 +0100 Subject: [PATCH 03/14] Replace undef in tests. --- llvm/test/CodeGen/X86/shuffle-blendw.ll | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/test/CodeGen/X86/shuffle-blendw.ll b/llvm/test/CodeGen/X86/shuffle-blendw.ll index a1af29550f64f..20239362d2480 100644 --- a/llvm/test/CodeGen/X86/shuffle-blendw.ll +++ b/llvm/test/CodeGen/X86/shuffle-blendw.ll @@ -334,8 +334,8 @@ define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) { ; X64-AVX512-NEXT: retq %ld0 = load <4 x double>, ptr %p0, align 32 %ld1 = load <4 x double>, ptr %p1, align 32 - %bcst0 = shufflevector <4 x double> %ld0, <4 x double> undef, <4 x i32> zeroinitializer - %bcst1 = shufflevector <4 x double> %ld1, <4 x double> undef, <4 x i32> zeroinitializer + %bcst0 = shufflevector <4 x double> %ld0, <4 x double> poison, <4 x i32> zeroinitializer + %bcst1 = shufflevector <4 x double> %ld1, <4 x double> poison, <4 x i32> zeroinitializer %blend = shufflevector <4 x double> %bcst0, <4 x double> %bcst1, <4 x i32> ret <4 x double> %blend } @@ -568,8 +568,8 @@ define <4 x double> @blend_broadcasts_v1f64_4x(ptr %p0, ptr %p1) { ; X64-AVX512-NEXT: retq %ld0 = load <1 x double>, ptr %p0, align 32 %ld1 = load <1 x double>, ptr %p1, align 32 - %bcst0 = shufflevector <1 x double> %ld0, <1 x double> undef, <4 x i32> zeroinitializer - %bcst1 = shufflevector <1 x double> %ld1, <1 x double> undef, <4 x i32> zeroinitializer + %bcst0 = shufflevector <1 x double> %ld0, <1 x double> poison, <4 x i32> zeroinitializer + %bcst1 = shufflevector <1 x double> %ld1, <1 x double> poison, <4 x i32> zeroinitializer %blend = shufflevector <4 x double> %bcst0, <4 x double> %bcst1, <4 x i32> ret <4 x double> %blend } @@ -644,8 +644,8 @@ define <4 x double> @blend_broadcasts_v1f64_2x(ptr %p0, ptr %p1) { ; X64-AVX512-NEXT: retq %ld0 = load <1 x double>, ptr %p0, align 32 %ld1 = load <1 x double>, ptr %p1, align 32 - %bcst0 = shufflevector <1 x double> %ld0, <1 x double> undef, <2 x i32> zeroinitializer - %bcst1 = shufflevector <1 x double> %ld1, <1 x double> undef, <2 x i32> zeroinitializer + %bcst0 = shufflevector <1 x double> %ld0, <1 x double> poison, <2 x i32> zeroinitializer + %bcst1 = shufflevector <1 x double> %ld1, <1 x double> poison, <2 x i32> zeroinitializer %blend = shufflevector <2 x double> %bcst0, <2 x double> %bcst1, <4 x i32> ret <4 x double> %blend } From 5828e0222b1d55a9feab0ba4700c4b34c8f92ba8 Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Wed, 16 Apr 2025 06:43:27 +0100 Subject: [PATCH 04/14] Address comments. 
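Move the BUILD_VECTOR(X,Y,Y,X) matching out of LowerBUILD_VECTOR into a dedicated lowerBuildVectorAsBlend helper that builds the two splats with getSplatBuildVector instead of hand-rolled BUILD_VECTOR/vector_shuffle pairs, and relocate the new tests from shuffle-blendw.ll into vector-shuffle-256-v4.ll.

As a rough illustration (a hypothetical function, not taken from the test files in this series), IR of the following shape produces the X,Y,Y,X build vector the helper targets and, like the <1 x double> broadcast tests below, should now select to two vbroadcastsd plus a single vblendps on AVX targets:

  define <4 x double> @blend_scalars_xyyx(ptr %p0, ptr %p1) {
    %x = load double, ptr %p0, align 8
    %y = load double, ptr %p1, align 8
    %v0 = insertelement <4 x double> poison, double %x, i64 0
    %v1 = insertelement <4 x double> %v0, double %y, i64 1
    %v2 = insertelement <4 x double> %v1, double %y, i64 2
    %v3 = insertelement <4 x double> %v2, double %x, i64 3
    ret <4 x double> %v3
  }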
--- llvm/lib/Target/X86/X86ISelLowering.cpp | 62 ++- llvm/test/CodeGen/X86/shuffle-blendw.ll | 386 ------------------ .../test/CodeGen/X86/vector-shuffle-256-v4.ll | 81 ++++ 3 files changed, 110 insertions(+), 419 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f673266fb89db..43a56a67d82c1 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -8783,6 +8783,33 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL, return LowerShift(Res, Subtarget, DAG); } +/// Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats +/// representing a blend. +static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL, + X86Subtarget const &Subtarget, + SelectionDAG &DAG) { + if (!Subtarget.hasAVX()) + return {}; + + auto VT = BVOp->getSimpleValueType(0u); + + if (VT == MVT::v4f64 && BVOp->getNumOperands() == 4u) { + SDValue Op0 = BVOp->getOperand(0u); + SDValue Op1 = BVOp->getOperand(1u); + SDValue Op2 = BVOp->getOperand(2u); + SDValue Op3 = BVOp->getOperand(3u); + + // Match X,Y,Y,X inputs. + if (Op0 == Op3 && Op1 == Op2 && Op0 != Op1) { + auto NewOp0 = DAG.getSplatBuildVector(VT, DL, Op0); + auto NewOp1 = DAG.getSplatBuildVector(VT, DL, Op1); + return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, {0, 5, 6, 3}); + } + } + + return {}; +} + /// Create a vector constant without a load. SSE/AVX provide the bare minimum /// functionality to do this, so it's all zeros, all ones, or some derivation /// that is cheap to calculate. @@ -9133,39 +9160,6 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { MVT OpEltVT = Op.getOperand(0).getSimpleValueType(); unsigned NumElems = Op.getNumOperands(); - // Match BUILD_VECTOR of scalars that we can lower to X86ISD::BLENDI via - // shuffles. - // - // v4f64 = BUILD_VECTOR X,Y,Y,X - // >>> - // t1: v4f64 = BUILD_VECTOR X,u,u,u - // t3: v4f64 = vector_shuffle<0,u,u,0> t1, u - // t2: v4f64 = BUILD_VECTOR Y,u,u,u - // t4: v4f64 = vector_shuffle t2, u - // v4f64 = vector_shuffle<0,5,6,3> t3, t4 - // - if (Subtarget.hasAVX() && VT == MVT::v4f64 && Op->getNumOperands() == 4u) { - auto Op0 = Op->getOperand(0u); - auto Op1 = Op->getOperand(1u); - auto Op2 = Op->getOperand(2u); - auto Op3 = Op->getOperand(3u); - - // Match X,Y,Y,X inputs. - if (Op0 == Op3 && Op1 == Op2 && Op0 != Op1) { - auto PsnVal = DAG.getUNDEF(MVT::f64); - - auto NewOp0 = DAG.getBuildVector(VT, dl, {Op0, PsnVal, PsnVal, PsnVal}); - NewOp0 = DAG.getVectorShuffle(VT, dl, NewOp0, DAG.getUNDEF(VT), - {0, -1, -1, 0}); - - auto NewOp1 = DAG.getBuildVector(VT, dl, {Op1, PsnVal, PsnVal, PsnVal}); - NewOp1 = DAG.getVectorShuffle(VT, dl, NewOp1, DAG.getUNDEF(VT), - {-1, 0, 0, -1}); - - return DAG.getVectorShuffle(VT, dl, NewOp0, NewOp1, {0, 5, 6, 3}); - } - } - // Generate vectors for predicate vectors. 
if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget); @@ -9278,6 +9272,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return Broadcast; if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG)) return BitOp; + if (SDValue Blend = lowerBuildVectorAsBlend(BV, dl, Subtarget, DAG)) + return Blend; unsigned NumZero = ZeroMask.popcount(); unsigned NumNonZero = NonZeroMask.popcount(); diff --git a/llvm/test/CodeGen/X86/shuffle-blendw.ll b/llvm/test/CodeGen/X86/shuffle-blendw.ll index 20239362d2480..9f90657dc64d1 100644 --- a/llvm/test/CodeGen/X86/shuffle-blendw.ll +++ b/llvm/test/CodeGen/X86/shuffle-blendw.ll @@ -263,389 +263,3 @@ define <8 x i16> @blendw_to_blendd_fail_16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %shuffle = shufflevector <8 x i16> %x1, <8 x i16> %y, <8 x i32> ret <8 x i16> %shuffle } - -define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) { -; X86-SSE41-LABEL: blend_broadcasts_v4f64: -; X86-SSE41: # %bb.0: -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE41-NEXT: movaps (%ecx), %xmm2 -; X86-SSE41-NEXT: movaps (%eax), %xmm1 -; X86-SSE41-NEXT: movaps %xmm2, %xmm0 -; X86-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X86-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; X86-SSE41-NEXT: retl -; -; X64-SSE41-LABEL: blend_broadcasts_v4f64: -; X64-SSE41: # %bb.0: -; X64-SSE41-NEXT: movaps (%rdi), %xmm2 -; X64-SSE41-NEXT: movaps (%rsi), %xmm1 -; X64-SSE41-NEXT: movaps %xmm2, %xmm0 -; X64-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X64-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; X64-SSE41-NEXT: retq -; -; X86-AVX-LABEL: blend_broadcasts_v4f64: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: vbroadcastsd (%ecx), %ymm0 -; X86-AVX-NEXT: vbroadcastsd (%eax), %ymm1 -; X86-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] -; X86-AVX-NEXT: retl -; -; X64-AVX-LABEL: blend_broadcasts_v4f64: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vbroadcastsd (%rdi), %ymm0 -; X64-AVX-NEXT: vbroadcastsd (%rsi), %ymm1 -; X64-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] -; X64-AVX-NEXT: retq -; -; X86-AVX2-LABEL: blend_broadcasts_v4f64: -; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX2-NEXT: vbroadcastsd (%ecx), %ymm0 -; X86-AVX2-NEXT: vbroadcastsd (%eax), %ymm1 -; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] -; X86-AVX2-NEXT: retl -; -; X64-AVX2-LABEL: blend_broadcasts_v4f64: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vbroadcastsd (%rdi), %ymm0 -; X64-AVX2-NEXT: vbroadcastsd (%rsi), %ymm1 -; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] -; X64-AVX2-NEXT: retq -; -; X86-AVX512-LABEL: blend_broadcasts_v4f64: -; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX512-NEXT: vbroadcastsd (%ecx), %ymm0 -; X86-AVX512-NEXT: vbroadcastsd (%eax), %ymm1 -; X86-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] -; X86-AVX512-NEXT: retl -; -; X64-AVX512-LABEL: blend_broadcasts_v4f64: -; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vbroadcastsd (%rdi), %ymm0 -; X64-AVX512-NEXT: vbroadcastsd (%rsi), %ymm1 -; X64-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] -; 
X64-AVX512-NEXT: retq - %ld0 = load <4 x double>, ptr %p0, align 32 - %ld1 = load <4 x double>, ptr %p1, align 32 - %bcst0 = shufflevector <4 x double> %ld0, <4 x double> poison, <4 x i32> zeroinitializer - %bcst1 = shufflevector <4 x double> %ld1, <4 x double> poison, <4 x i32> zeroinitializer - %blend = shufflevector <4 x double> %bcst0, <4 x double> %bcst1, <4 x i32> - ret <4 x double> %blend -} - -define <4 x double> @blend_broadcasts_v2f64(ptr %p0, ptr %p1) { -; X86-SSE41-LABEL: blend_broadcasts_v2f64: -; X86-SSE41: # %bb.0: -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE41-NEXT: movaps (%ecx), %xmm2 -; X86-SSE41-NEXT: movaps (%eax), %xmm1 -; X86-SSE41-NEXT: movaps %xmm2, %xmm0 -; X86-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X86-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; X86-SSE41-NEXT: retl -; -; X64-SSE41-LABEL: blend_broadcasts_v2f64: -; X64-SSE41: # %bb.0: -; X64-SSE41-NEXT: movaps (%rdi), %xmm2 -; X64-SSE41-NEXT: movaps (%rsi), %xmm1 -; X64-SSE41-NEXT: movaps %xmm2, %xmm0 -; X64-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X64-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; X64-SSE41-NEXT: retq -; -; X86-AVX-LABEL: blend_broadcasts_v2f64: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] -; X86-AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; X86-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; X86-AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; X86-AVX-NEXT: retl -; -; X64-AVX-LABEL: blend_broadcasts_v2f64: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] -; X64-AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; X64-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; X64-AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; X64-AVX-NEXT: retq -; -; X86-AVX2-LABEL: blend_broadcasts_v2f64: -; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X86-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] -; X86-AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; X86-AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; X86-AVX2-NEXT: retl -; -; X64-AVX2-LABEL: blend_broadcasts_v2f64: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] -; X64-AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; X64-AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; X64-AVX2-NEXT: retq -; -; X86-AVX512-LABEL: blend_broadcasts_v2f64: -; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] -; X86-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1] -; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,4,6,2] -; X86-AVX512-NEXT: vpermi2pd %ymm1, %ymm2, %ymm0 -; X86-AVX512-NEXT: retl -; -; 
X64-AVX512-LABEL: blend_broadcasts_v2f64: -; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] -; X64-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1] -; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,4,6,2] -; X64-AVX512-NEXT: vpermi2pd %ymm1, %ymm2, %ymm0 -; X64-AVX512-NEXT: retq - %ld0 = load <2 x double>, ptr %p0, align 32 - %ld1 = load <2 x double>, ptr %p1, align 32 - %blend = shufflevector <2 x double> %ld0, <2 x double> %ld1, <4 x i32> - ret <4 x double> %blend -} - -define <4 x double> @blend_broadcasts_v1f64(ptr %p0, ptr %p1) { -; X86-SSE41-LABEL: blend_broadcasts_v1f64: -; X86-SSE41: # %bb.0: -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X86-SSE41-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; X86-SSE41-NEXT: movaps %xmm2, %xmm0 -; X86-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X86-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; X86-SSE41-NEXT: retl -; -; X64-SSE41-LABEL: blend_broadcasts_v1f64: -; X64-SSE41: # %bb.0: -; X64-SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X64-SSE41-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; X64-SSE41-NEXT: movaps %xmm2, %xmm0 -; X64-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X64-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; X64-SSE41-NEXT: retq -; -; X86-AVX-LABEL: blend_broadcasts_v1f64: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: vbroadcastsd (%ecx), %ymm0 -; X86-AVX-NEXT: vbroadcastsd (%eax), %ymm1 -; X86-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; X86-AVX-NEXT: retl -; -; X64-AVX-LABEL: blend_broadcasts_v1f64: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vbroadcastsd (%rsi), %ymm0 -; X64-AVX-NEXT: vbroadcastsd (%rdi), %ymm1 -; X64-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; X64-AVX-NEXT: retq -; -; X86-AVX2-LABEL: blend_broadcasts_v1f64: -; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX2-NEXT: vbroadcastsd (%ecx), %ymm0 -; X86-AVX2-NEXT: vbroadcastsd (%eax), %ymm1 -; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; X86-AVX2-NEXT: retl -; -; X64-AVX2-LABEL: blend_broadcasts_v1f64: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vbroadcastsd (%rsi), %ymm0 -; X64-AVX2-NEXT: vbroadcastsd (%rdi), %ymm1 -; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; X64-AVX2-NEXT: retq -; -; X86-AVX512-LABEL: blend_broadcasts_v1f64: -; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX512-NEXT: vbroadcastsd (%ecx), %ymm0 -; X86-AVX512-NEXT: vbroadcastsd (%eax), %ymm1 -; X86-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; X86-AVX512-NEXT: retl -; -; X64-AVX512-LABEL: blend_broadcasts_v1f64: -; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vbroadcastsd (%rsi), %ymm0 -; X64-AVX512-NEXT: vbroadcastsd (%rdi), %ymm1 -; X64-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; X64-AVX512-NEXT: retq - %ld0 = load <1 x double>, ptr %p0, align 32 - %ld1 = load <1 x double>, ptr %p1, align 32 - %blend = shufflevector <1 x double> %ld0, <1 x double> %ld1, <4 x i32> - ret <4 x double> %blend -} - -define <4 x double> @blend_broadcasts_v1f64_4x(ptr %p0, ptr %p1) { -; X86-SSE41-LABEL: blend_broadcasts_v1f64_4x: -; X86-SSE41: 
# %bb.0: -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X86-SSE41-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; X86-SSE41-NEXT: movaps %xmm2, %xmm0 -; X86-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X86-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; X86-SSE41-NEXT: retl -; -; X64-SSE41-LABEL: blend_broadcasts_v1f64_4x: -; X64-SSE41: # %bb.0: -; X64-SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X64-SSE41-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; X64-SSE41-NEXT: movaps %xmm2, %xmm0 -; X64-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X64-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; X64-SSE41-NEXT: retq -; -; X86-AVX-LABEL: blend_broadcasts_v1f64_4x: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: vbroadcastsd (%ecx), %ymm0 -; X86-AVX-NEXT: vbroadcastsd (%eax), %ymm1 -; X86-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; X86-AVX-NEXT: retl -; -; X64-AVX-LABEL: blend_broadcasts_v1f64_4x: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vbroadcastsd (%rsi), %ymm0 -; X64-AVX-NEXT: vbroadcastsd (%rdi), %ymm1 -; X64-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; X64-AVX-NEXT: retq -; -; X86-AVX2-LABEL: blend_broadcasts_v1f64_4x: -; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX2-NEXT: vbroadcastsd (%ecx), %ymm0 -; X86-AVX2-NEXT: vbroadcastsd (%eax), %ymm1 -; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; X86-AVX2-NEXT: retl -; -; X64-AVX2-LABEL: blend_broadcasts_v1f64_4x: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vbroadcastsd (%rsi), %ymm0 -; X64-AVX2-NEXT: vbroadcastsd (%rdi), %ymm1 -; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; X64-AVX2-NEXT: retq -; -; X86-AVX512-LABEL: blend_broadcasts_v1f64_4x: -; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX512-NEXT: vbroadcastsd (%ecx), %ymm0 -; X86-AVX512-NEXT: vbroadcastsd (%eax), %ymm1 -; X86-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; X86-AVX512-NEXT: retl -; -; X64-AVX512-LABEL: blend_broadcasts_v1f64_4x: -; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vbroadcastsd (%rsi), %ymm0 -; X64-AVX512-NEXT: vbroadcastsd (%rdi), %ymm1 -; X64-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; X64-AVX512-NEXT: retq - %ld0 = load <1 x double>, ptr %p0, align 32 - %ld1 = load <1 x double>, ptr %p1, align 32 - %bcst0 = shufflevector <1 x double> %ld0, <1 x double> poison, <4 x i32> zeroinitializer - %bcst1 = shufflevector <1 x double> %ld1, <1 x double> poison, <4 x i32> zeroinitializer - %blend = shufflevector <4 x double> %bcst0, <4 x double> %bcst1, <4 x i32> - ret <4 x double> %blend -} - -define <4 x double> @blend_broadcasts_v1f64_2x(ptr %p0, ptr %p1) { -; X86-SSE41-LABEL: blend_broadcasts_v1f64_2x: -; X86-SSE41: # %bb.0: -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X86-SSE41-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; X86-SSE41-NEXT: movaps %xmm2, %xmm0 -; X86-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X86-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; X86-SSE41-NEXT: retl -; -; X64-SSE41-LABEL: 
blend_broadcasts_v1f64_2x: -; X64-SSE41: # %bb.0: -; X64-SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X64-SSE41-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; X64-SSE41-NEXT: movaps %xmm2, %xmm0 -; X64-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X64-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; X64-SSE41-NEXT: retq -; -; X86-AVX-LABEL: blend_broadcasts_v1f64_2x: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: vbroadcastsd (%ecx), %ymm0 -; X86-AVX-NEXT: vbroadcastsd (%eax), %ymm1 -; X86-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; X86-AVX-NEXT: retl -; -; X64-AVX-LABEL: blend_broadcasts_v1f64_2x: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vbroadcastsd (%rsi), %ymm0 -; X64-AVX-NEXT: vbroadcastsd (%rdi), %ymm1 -; X64-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; X64-AVX-NEXT: retq -; -; X86-AVX2-LABEL: blend_broadcasts_v1f64_2x: -; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX2-NEXT: vbroadcastsd (%ecx), %ymm0 -; X86-AVX2-NEXT: vbroadcastsd (%eax), %ymm1 -; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; X86-AVX2-NEXT: retl -; -; X64-AVX2-LABEL: blend_broadcasts_v1f64_2x: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vbroadcastsd (%rsi), %ymm0 -; X64-AVX2-NEXT: vbroadcastsd (%rdi), %ymm1 -; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; X64-AVX2-NEXT: retq -; -; X86-AVX512-LABEL: blend_broadcasts_v1f64_2x: -; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX512-NEXT: vbroadcastsd (%ecx), %ymm0 -; X86-AVX512-NEXT: vbroadcastsd (%eax), %ymm1 -; X86-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; X86-AVX512-NEXT: retl -; -; X64-AVX512-LABEL: blend_broadcasts_v1f64_2x: -; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vbroadcastsd (%rsi), %ymm0 -; X64-AVX512-NEXT: vbroadcastsd (%rdi), %ymm1 -; X64-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; X64-AVX512-NEXT: retq - %ld0 = load <1 x double>, ptr %p0, align 32 - %ld1 = load <1 x double>, ptr %p1, align 32 - %bcst0 = shufflevector <1 x double> %ld0, <1 x double> poison, <2 x i32> zeroinitializer - %bcst1 = shufflevector <1 x double> %ld1, <1 x double> poison, <2 x i32> zeroinitializer - %blend = shufflevector <2 x double> %bcst0, <2 x double> %bcst1, <4 x i32> - ret <4 x double> %blend -} diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll index fb8618be17f06..f74b6867786b1 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -2367,6 +2367,87 @@ define <4 x double> @unpckh_v4f64(<4 x double> %x, <4 x double> %y) { ret <4 x double> %unpckh } +define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) { +; ALL-LABEL: blend_broadcasts_v4f64: +; ALL: # %bb.0: +; ALL-NEXT: vbroadcastsd (%rdi), %ymm0 +; ALL-NEXT: vbroadcastsd (%rsi), %ymm1 +; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] +; ALL-NEXT: retq + %ld0 = load <4 x double>, ptr %p0, align 32 + %ld1 = load <4 x double>, ptr %p1, align 32 + %bcst0 = shufflevector <4 x double> %ld0, <4 x double> undef, <4 x i32> zeroinitializer + %bcst1 = shufflevector <4 x double> %ld1, <4 x double> undef, <4 x i32> zeroinitializer + %blend = shufflevector <4 x double> 
%bcst0, <4 x double> %bcst1, <4 x i32> + ret <4 x double> %blend +} + +define <4 x double> @blend_broadcasts_v2f64(ptr %p0, ptr %p1) { +; AVX1OR2-LABEL: blend_broadcasts_v2f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX1OR2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX1OR2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1OR2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: blend_broadcasts_v2f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX512VL-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1] +; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,4,6,2] +; AVX512VL-NEXT: vpermi2pd %ymm1, %ymm2, %ymm0 +; AVX512VL-NEXT: retq + %ld0 = load <2 x double>, ptr %p0, align 32 + %ld1 = load <2 x double>, ptr %p1, align 32 + %blend = shufflevector <2 x double> %ld0, <2 x double> %ld1, <4 x i32> + ret <4 x double> %blend +} + +define <4 x double> @blend_broadcasts_v1f64(ptr %p0, ptr %p1) { +; ALL-LABEL: blend_broadcasts_v1f64: +; ALL: # %bb.0: +; ALL-NEXT: vbroadcastsd (%rsi), %ymm0 +; ALL-NEXT: vbroadcastsd (%rdi), %ymm1 +; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; ALL-NEXT: retq + %ld0 = load <1 x double>, ptr %p0, align 32 + %ld1 = load <1 x double>, ptr %p1, align 32 + %blend = shufflevector <1 x double> %ld0, <1 x double> %ld1, <4 x i32> + ret <4 x double> %blend +} + +define <4 x double> @blend_broadcasts_v1f64_4x(ptr %p0, ptr %p1) { +; ALL-LABEL: blend_broadcasts_v1f64_4x: +; ALL: # %bb.0: +; ALL-NEXT: vbroadcastsd (%rsi), %ymm0 +; ALL-NEXT: vbroadcastsd (%rdi), %ymm1 +; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; ALL-NEXT: retq + %ld0 = load <1 x double>, ptr %p0, align 32 + %ld1 = load <1 x double>, ptr %p1, align 32 + %bcst0 = shufflevector <1 x double> %ld0, <1 x double> undef, <4 x i32> zeroinitializer + %bcst1 = shufflevector <1 x double> %ld1, <1 x double> undef, <4 x i32> zeroinitializer + %blend = shufflevector <4 x double> %bcst0, <4 x double> %bcst1, <4 x i32> + ret <4 x double> %blend +} + +define <4 x double> @blend_broadcasts_v1f64_2x(ptr %p0, ptr %p1) { +; ALL-LABEL: blend_broadcasts_v1f64_2x: +; ALL: # %bb.0: +; ALL-NEXT: vbroadcastsd (%rsi), %ymm0 +; ALL-NEXT: vbroadcastsd (%rdi), %ymm1 +; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; ALL-NEXT: retq + %ld0 = load <1 x double>, ptr %p0, align 32 + %ld1 = load <1 x double>, ptr %p1, align 32 + %bcst0 = shufflevector <1 x double> %ld0, <1 x double> undef, <2 x i32> zeroinitializer + %bcst1 = shufflevector <1 x double> %ld1, <1 x double> undef, <2 x i32> zeroinitializer + %blend = shufflevector <2 x double> %bcst0, <2 x double> %bcst1, <4 x i32> + ret <4 x double> %blend +} + !llvm.module.flags = !{!0} !0 = !{i32 1, !"ProfileSummary", !1} !1 = !{!2, !3, !4, !5, !6, !7, !8, !9} From 95a75a656b4d7972dbb37b0c00baa8a36a458cdb Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Wed, 16 Apr 2025 06:50:20 +0100 Subject: [PATCH 05/14] Remove undef. 
--- llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll index f74b6867786b1..f57287a5ebc62 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -2376,8 +2376,8 @@ define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) { ; ALL-NEXT: retq %ld0 = load <4 x double>, ptr %p0, align 32 %ld1 = load <4 x double>, ptr %p1, align 32 - %bcst0 = shufflevector <4 x double> %ld0, <4 x double> undef, <4 x i32> zeroinitializer - %bcst1 = shufflevector <4 x double> %ld1, <4 x double> undef, <4 x i32> zeroinitializer + %bcst0 = shufflevector <4 x double> %ld0, <4 x double> poison, <4 x i32> zeroinitializer + %bcst1 = shufflevector <4 x double> %ld1, <4 x double> poison, <4 x i32> zeroinitializer %blend = shufflevector <4 x double> %bcst0, <4 x double> %bcst1, <4 x i32> ret <4 x double> %blend } @@ -2427,8 +2427,8 @@ define <4 x double> @blend_broadcasts_v1f64_4x(ptr %p0, ptr %p1) { ; ALL-NEXT: retq %ld0 = load <1 x double>, ptr %p0, align 32 %ld1 = load <1 x double>, ptr %p1, align 32 - %bcst0 = shufflevector <1 x double> %ld0, <1 x double> undef, <4 x i32> zeroinitializer - %bcst1 = shufflevector <1 x double> %ld1, <1 x double> undef, <4 x i32> zeroinitializer + %bcst0 = shufflevector <1 x double> %ld0, <1 x double> poison, <4 x i32> zeroinitializer + %bcst1 = shufflevector <1 x double> %ld1, <1 x double> poison, <4 x i32> zeroinitializer %blend = shufflevector <4 x double> %bcst0, <4 x double> %bcst1, <4 x i32> ret <4 x double> %blend } @@ -2442,8 +2442,8 @@ define <4 x double> @blend_broadcasts_v1f64_2x(ptr %p0, ptr %p1) { ; ALL-NEXT: retq %ld0 = load <1 x double>, ptr %p0, align 32 %ld1 = load <1 x double>, ptr %p1, align 32 - %bcst0 = shufflevector <1 x double> %ld0, <1 x double> undef, <2 x i32> zeroinitializer - %bcst1 = shufflevector <1 x double> %ld1, <1 x double> undef, <2 x i32> zeroinitializer + %bcst0 = shufflevector <1 x double> %ld0, <1 x double> poison, <2 x i32> zeroinitializer + %bcst1 = shufflevector <1 x double> %ld1, <1 x double> poison, <2 x i32> zeroinitializer %blend = shufflevector <2 x double> %bcst0, <2 x double> %bcst1, <4 x i32> ret <4 x double> %blend } From 660ccf32e3a5acace8863d923ecf4e2f02bb5b96 Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Sat, 26 Apr 2025 06:59:25 +0100 Subject: [PATCH 06/14] Address comments. 
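The reworked lowering collects the unique BUILD_VECTOR operands, bails out
unless exactly two remain, and then builds a mask that selects lane I from
the first splat and lane I + NumElems from the second before emitting a
shuffle of the two splats. As a hypothetical IR-level illustration (the
x,y,y,x element order and the names are only an example, not taken from the
tests), the pattern this targets is equivalent to:

  ; x,y,y,x built from two scalars, expressed as a shuffle of two splats.
  define <4 x double> @xyyx_blend_sketch(double %x, double %y) {
    %ix = insertelement <4 x double> poison, double %x, i32 0
    %splat.x = shufflevector <4 x double> %ix, <4 x double> poison, <4 x i32> zeroinitializer
    %iy = insertelement <4 x double> poison, double %y, i32 0
    %splat.y = shufflevector <4 x double> %iy, <4 x double> poison, <4 x i32> zeroinitializer
    %blend = shufflevector <4 x double> %splat.x, <4 x double> %splat.y, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
    ret <4 x double> %blend
  }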
--- llvm/lib/Target/X86/X86ISelLowering.cpp | 46 ++++++++++++++++--------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 43a56a67d82c1..c01bf885f5ff3 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -21,6 +21,7 @@ #include "X86TargetMachine.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" @@ -37,6 +38,7 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SDPatternMatch.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/CallingConv.h" @@ -8788,23 +8790,33 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL, static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL, X86Subtarget const &Subtarget, SelectionDAG &DAG) { - if (!Subtarget.hasAVX()) - return {}; - - auto VT = BVOp->getSimpleValueType(0u); - - if (VT == MVT::v4f64 && BVOp->getNumOperands() == 4u) { - SDValue Op0 = BVOp->getOperand(0u); - SDValue Op1 = BVOp->getOperand(1u); - SDValue Op2 = BVOp->getOperand(2u); - SDValue Op3 = BVOp->getOperand(3u); - - // Match X,Y,Y,X inputs. - if (Op0 == Op3 && Op1 == Op2 && Op0 != Op1) { - auto NewOp0 = DAG.getSplatBuildVector(VT, DL, Op0); - auto NewOp1 = DAG.getSplatBuildVector(VT, DL, Op1); - return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, {0, 5, 6, 3}); - } + MVT VT = BVOp->getSimpleValueType(0u); + auto const NumElems = VT.getVectorNumElements(); + + if (Subtarget.hasAVX() && VT == MVT::v4f64) { + // Collect unique operands. + auto UniqueOps = SmallSet(); + for (auto &Op : BVOp->ops()) { + if (isIntOrFPConstant(Op) || Op.get()->isUndef()) + return {}; + UniqueOps.insert(Op); + } + // Candidate BUILD_VECTOR must have 2 unique operands. + if (UniqueOps.size() != 2u) + return {}; + // Create shuffle mask. + auto Op0 = BVOp->getOperand(0u); + auto Mask = std::vector(); + Mask.reserve(NumElems); + for (auto I = 0u; I < NumElems; ++I) { + auto &Op = BVOp->getOperand(I); + Mask.push_back(Op == Op0 ? I : I + NumElems); + } + // Create shuffle of splats. + UniqueOps.erase(Op0); + auto NewOp0 = DAG.getSplatBuildVector(VT, DL, Op0); + auto NewOp1 = DAG.getSplatBuildVector(VT, DL, *UniqueOps.begin()); + return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, Mask); } return {}; From 25c54644d854f8f60d2615007fcb0779362cc71d Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Sat, 26 Apr 2025 07:30:40 +0100 Subject: [PATCH 07/14] Update tests. 
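Regenerate the affected check lines. For reference, the repeated-element
input these tests exercise is an a0,a1,a1,a0 insertelement chain; a reduced,
hypothetical sketch (mirroring the existing test_buildvector_4f64_2_var
input, not a new test) looks like:

  define <4 x double> @buildvector_a0_a1_a1_a0_sketch(double %a0, double %a1) {
    %v0 = insertelement <4 x double> poison, double %a0, i32 0
    %v1 = insertelement <4 x double> %v0, double %a1, i32 1
    %v2 = insertelement <4 x double> %v1, double %a1, i32 2
    %v3 = insertelement <4 x double> %v2, double %a0, i32 3
    ret <4 x double> %v3
  }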
--- llvm/test/CodeGen/X86/build-vector-256.ll | 44 ++++++++++--------- llvm/test/CodeGen/X86/build-vector-512.ll | 36 --------------- .../test/CodeGen/X86/vector-shuffle-256-v4.ll | 38 ---------------- 3 files changed, 24 insertions(+), 94 deletions(-) diff --git a/llvm/test/CodeGen/X86/build-vector-256.ll b/llvm/test/CodeGen/X86/build-vector-256.ll index 6c1cbfb4014b6..ed00cfe4c32f1 100644 --- a/llvm/test/CodeGen/X86/build-vector-256.ll +++ b/llvm/test/CodeGen/X86/build-vector-256.ll @@ -417,18 +417,26 @@ define <32 x i8> @test_buildvector_v32i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, define <4 x double> @test_buildvector_4f64_2_var(double %a0, double %a1) { ; AVX-32-LABEL: test_buildvector_4f64_2_var: ; AVX-32: # %bb.0: -; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 -; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] -; AVX-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm0 +; AVX-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm1 +; AVX-32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX-32-NEXT: retl ; -; AVX-64-LABEL: test_buildvector_4f64_2_var: -; AVX-64: # %bb.0: -; AVX-64-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX-64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-64-NEXT: retq +; AVX1-64-LABEL: test_buildvector_4f64_2_var: +; AVX1-64: # %bb.0: +; AVX1-64-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] +; AVX1-64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX1-64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] +; AVX1-64-NEXT: retq +; +; AVX2-64-LABEL: test_buildvector_4f64_2_var: +; AVX2-64: # %bb.0: +; AVX2-64-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX2-64-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX2-64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] +; AVX2-64-NEXT: retq %v0 = insertelement <4 x double> poison, double %a0, i32 0 %v1 = insertelement <4 x double> %v0, double %a1, i32 1 %v2 = insertelement <4 x double> %v1, double %a1, i32 2 @@ -441,20 +449,16 @@ define <4 x double> @test_buildvector_4f64_2_load(ptr %p0, ptr %p1) { ; AVX-32: # %bb.0: ; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %eax ; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; AVX-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-32-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX-32-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-32-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-32-NEXT: vbroadcastsd (%ecx), %ymm0 +; AVX-32-NEXT: vbroadcastsd (%eax), %ymm1 +; AVX-32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX-32-NEXT: retl ; ; AVX-64-LABEL: test_buildvector_4f64_2_load: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-64-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX-64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-64-NEXT: vbroadcastsd (%rsi), %ymm0 +; AVX-64-NEXT: vbroadcastsd (%rdi), %ymm1 +; AVX-64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX-64-NEXT: retq %a0 = load double, ptr %p0 %a1 = load double, ptr %p1 diff --git a/llvm/test/CodeGen/X86/build-vector-512.ll b/llvm/test/CodeGen/X86/build-vector-512.ll index 5d38f087aa1b3..33493f43fd134 
100644 --- a/llvm/test/CodeGen/X86/build-vector-512.ll +++ b/llvm/test/CodeGen/X86/build-vector-512.ll @@ -480,23 +480,6 @@ define <64 x i8> @test_buildvector_v64i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, ; build vectors of repeated elements define <8 x double> @test_buildvector_8f64_2_var(double %a0, double %a1) { -; AVX-32-LABEL: test_buildvector_8f64_2_var: -; AVX-32: # %bb.0: -; AVX-32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm1 -; AVX-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 -; AVX-32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-32-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX-32-NEXT: retl -; -; AVX-64-LABEL: test_buildvector_8f64_2_var: -; AVX-64: # %bb.0: -; AVX-64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; AVX-64-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm1[0] -; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 -; AVX-64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-64-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX-64-NEXT: retq %v0 = insertelement <8 x double> poison, double %a0, i32 0 %v1 = insertelement <8 x double> %v0, double %a1, i32 1 %v2 = insertelement <8 x double> %v1, double %a0, i32 2 @@ -509,25 +492,6 @@ define <8 x double> @test_buildvector_8f64_2_var(double %a0, double %a1) { } define <8 x double> @test_buildvector_8f64_2_load(ptr %p0, ptr %p1) { -; AVX-32-LABEL: test_buildvector_8f64_2_load: -; AVX-32: # %bb.0: -; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; AVX-32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm0[0,1],mem[0,1] -; AVX-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 -; AVX-32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-32-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX-32-NEXT: retl -; -; AVX-64-LABEL: test_buildvector_8f64_2_load: -; AVX-64: # %bb.0: -; AVX-64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX-64-NEXT: vmovhps {{.*#+}} xmm1 = xmm0[0,1],mem[0,1] -; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 -; AVX-64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-64-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX-64-NEXT: retq %a0 = load double, ptr %p0 %a1 = load double, ptr %p1 %v0 = insertelement <8 x double> poison, double %a0, i32 0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll index f57287a5ebc62..a746f3528a050 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -2367,44 +2367,6 @@ define <4 x double> @unpckh_v4f64(<4 x double> %x, <4 x double> %y) { ret <4 x double> %unpckh } -define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) { -; ALL-LABEL: blend_broadcasts_v4f64: -; ALL: # %bb.0: -; ALL-NEXT: vbroadcastsd (%rdi), %ymm0 -; ALL-NEXT: vbroadcastsd (%rsi), %ymm1 -; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] -; ALL-NEXT: retq - %ld0 = load <4 x double>, ptr %p0, align 32 - %ld1 = load <4 x double>, ptr %p1, align 32 - %bcst0 = shufflevector <4 x double> %ld0, <4 x double> poison, <4 x i32> zeroinitializer - %bcst1 = shufflevector <4 x double> %ld1, <4 x double> poison, <4 x i32> zeroinitializer - %blend = shufflevector <4 x double> %bcst0, <4 x double> %bcst1, <4 x i32> - ret <4 x double> %blend -} - -define <4 x double> @blend_broadcasts_v2f64(ptr %p0, ptr %p1) { -; AVX1OR2-LABEL: blend_broadcasts_v2f64: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX1OR2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = 
mem[0,1,0,1] -; AVX1OR2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1OR2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: blend_broadcasts_v2f64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] -; AVX512VL-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1] -; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,4,6,2] -; AVX512VL-NEXT: vpermi2pd %ymm1, %ymm2, %ymm0 -; AVX512VL-NEXT: retq - %ld0 = load <2 x double>, ptr %p0, align 32 - %ld1 = load <2 x double>, ptr %p1, align 32 - %blend = shufflevector <2 x double> %ld0, <2 x double> %ld1, <4 x i32> - ret <4 x double> %blend -} - define <4 x double> @blend_broadcasts_v1f64(ptr %p0, ptr %p1) { ; ALL-LABEL: blend_broadcasts_v1f64: ; ALL: # %bb.0: From c4c99d8bf3b19bc3ec123f915f3a408daee717f9 Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Sat, 26 Apr 2025 17:19:21 +0100 Subject: [PATCH 08/14] Address comments. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index c01bf885f5ff3..11454f5a56112 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -8793,29 +8793,28 @@ static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL, MVT VT = BVOp->getSimpleValueType(0u); auto const NumElems = VT.getVectorNumElements(); - if (Subtarget.hasAVX() && VT == MVT::v4f64) { + if (VT == MVT::v4f64) { // Collect unique operands. auto UniqueOps = SmallSet(); - for (auto &Op : BVOp->ops()) { - if (isIntOrFPConstant(Op) || Op.get()->isUndef()) - return {}; + for (SDValue Op : BVOp->ops()) { + if (isIntOrFPConstant(Op) || Op.isUndef()) + return SDValue(); UniqueOps.insert(Op); } // Candidate BUILD_VECTOR must have 2 unique operands. if (UniqueOps.size() != 2u) - return {}; + return SDValue(); // Create shuffle mask. - auto Op0 = BVOp->getOperand(0u); - auto Mask = std::vector(); - Mask.reserve(NumElems); + SDValue Op0 = BVOp->getOperand(0u); + SmallVector Mask(NumElems); for (auto I = 0u; I < NumElems; ++I) { - auto &Op = BVOp->getOperand(I); - Mask.push_back(Op == Op0 ? I : I + NumElems); + SDValue Op = BVOp->getOperand(I); + Mask[I] = Op == Op0 ? I : I + NumElems; } // Create shuffle of splats. - UniqueOps.erase(Op0); - auto NewOp0 = DAG.getSplatBuildVector(VT, DL, Op0); - auto NewOp1 = DAG.getSplatBuildVector(VT, DL, *UniqueOps.begin()); + + SDValue NewOp0 = DAG.getSplatBuildVector(VT, DL, *UniqueOps.begin()); + SDValue NewOp1 = DAG.getSplatBuildVector(VT, DL, *(++UniqueOps.begin())); return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, Mask); } From 367282311efc7890ca0b92d7486cc2b5179b1bcb Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Sat, 26 Apr 2025 17:25:06 +0100 Subject: [PATCH 09/14] Address comments. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 11454f5a56112..51d034f7c82e9 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -8805,16 +8805,16 @@ static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL, if (UniqueOps.size() != 2u) return SDValue(); // Create shuffle mask. 
- SDValue Op0 = BVOp->getOperand(0u); + SDValue Op0 = *(UniqueOps.begin()); + SDValue Op1 = *(++UniqueOps.begin()); SmallVector Mask(NumElems); for (auto I = 0u; I < NumElems; ++I) { SDValue Op = BVOp->getOperand(I); Mask[I] = Op == Op0 ? I : I + NumElems; } // Create shuffle of splats. - - SDValue NewOp0 = DAG.getSplatBuildVector(VT, DL, *UniqueOps.begin()); - SDValue NewOp1 = DAG.getSplatBuildVector(VT, DL, *(++UniqueOps.begin())); + SDValue NewOp0 = DAG.getSplatBuildVector(VT, DL, Op0); + SDValue NewOp1 = DAG.getSplatBuildVector(VT, DL, Op1); return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, Mask); } From 0eee2980b185c7c0d9610e96fe46692f7e7cceeb Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Sat, 26 Apr 2025 19:15:14 +0100 Subject: [PATCH 10/14] Update tests. --- llvm/test/CodeGen/X86/build-vector-512.ll | 74 +++++++++++++++++++++-- 1 file changed, 70 insertions(+), 4 deletions(-) diff --git a/llvm/test/CodeGen/X86/build-vector-512.ll b/llvm/test/CodeGen/X86/build-vector-512.ll index 33493f43fd134..789196c5e4848 100644 --- a/llvm/test/CodeGen/X86/build-vector-512.ll +++ b/llvm/test/CodeGen/X86/build-vector-512.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX-32 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX-64 -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX-32 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX-64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX-32,AVX512F-32 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX-64,AVX512F-64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX-32,AVX512BW-32 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX-64,AVX512BW-64 define <8 x double> @test_buildvector_v8f64(double %a0, double %a1, double %a2, double %a3, double %a4, double %a5, double %a6, double %a7) { ; AVX-32-LABEL: test_buildvector_v8f64: @@ -480,6 +480,37 @@ define <64 x i8> @test_buildvector_v64i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, ; build vectors of repeated elements define <8 x double> @test_buildvector_8f64_2_var(double %a0, double %a1) { +; AVX512F-32-LABEL: test_buildvector_8f64_2_var: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %zmm0 +; AVX512F-32-NEXT: movb $-126, %al +; AVX512F-32-NEXT: kmovw %eax, %k1 +; AVX512F-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %zmm0 {%k1} +; AVX512F-32-NEXT: retl +; +; AVX512F-64-LABEL: test_buildvector_8f64_2_var: +; AVX512F-64: # %bb.0: +; AVX512F-64-NEXT: vbroadcastsd %xmm0, %zmm0 +; AVX512F-64-NEXT: movb $-126, %al +; AVX512F-64-NEXT: kmovw %eax, %k1 +; AVX512F-64-NEXT: vbroadcastsd %xmm1, %zmm0 {%k1} +; AVX512F-64-NEXT: retq +; +; AVX512BW-32-LABEL: test_buildvector_8f64_2_var: +; AVX512BW-32: # %bb.0: +; AVX512BW-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %zmm0 +; AVX512BW-32-NEXT: movb $-126, %al +; AVX512BW-32-NEXT: kmovd %eax, %k1 +; AVX512BW-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %zmm0 {%k1} +; AVX512BW-32-NEXT: retl +; +; AVX512BW-64-LABEL: test_buildvector_8f64_2_var: +; AVX512BW-64: # %bb.0: +; AVX512BW-64-NEXT: vbroadcastsd %xmm0, %zmm0 +; AVX512BW-64-NEXT: movb $-126, 
%al +; AVX512BW-64-NEXT: kmovd %eax, %k1 +; AVX512BW-64-NEXT: vbroadcastsd %xmm1, %zmm0 {%k1} +; AVX512BW-64-NEXT: retq %v0 = insertelement <8 x double> poison, double %a0, i32 0 %v1 = insertelement <8 x double> %v0, double %a1, i32 1 %v2 = insertelement <8 x double> %v1, double %a0, i32 2 @@ -492,6 +523,41 @@ define <8 x double> @test_buildvector_8f64_2_var(double %a0, double %a1) { } define <8 x double> @test_buildvector_8f64_2_load(ptr %p0, ptr %p1) { +; AVX512F-32-LABEL: test_buildvector_8f64_2_load: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX512F-32-NEXT: vbroadcastsd (%ecx), %zmm0 +; AVX512F-32-NEXT: movb $-126, %cl +; AVX512F-32-NEXT: kmovw %ecx, %k1 +; AVX512F-32-NEXT: vbroadcastsd (%eax), %zmm0 {%k1} +; AVX512F-32-NEXT: retl +; +; AVX512F-64-LABEL: test_buildvector_8f64_2_load: +; AVX512F-64: # %bb.0: +; AVX512F-64-NEXT: vbroadcastsd (%rdi), %zmm0 +; AVX512F-64-NEXT: movb $-126, %al +; AVX512F-64-NEXT: kmovw %eax, %k1 +; AVX512F-64-NEXT: vbroadcastsd (%rsi), %zmm0 {%k1} +; AVX512F-64-NEXT: retq +; +; AVX512BW-32-LABEL: test_buildvector_8f64_2_load: +; AVX512BW-32: # %bb.0: +; AVX512BW-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512BW-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX512BW-32-NEXT: vbroadcastsd (%ecx), %zmm0 +; AVX512BW-32-NEXT: movb $-126, %cl +; AVX512BW-32-NEXT: kmovd %ecx, %k1 +; AVX512BW-32-NEXT: vbroadcastsd (%eax), %zmm0 {%k1} +; AVX512BW-32-NEXT: retl +; +; AVX512BW-64-LABEL: test_buildvector_8f64_2_load: +; AVX512BW-64: # %bb.0: +; AVX512BW-64-NEXT: vbroadcastsd (%rdi), %zmm0 +; AVX512BW-64-NEXT: movb $-126, %al +; AVX512BW-64-NEXT: kmovd %eax, %k1 +; AVX512BW-64-NEXT: vbroadcastsd (%rsi), %zmm0 {%k1} +; AVX512BW-64-NEXT: retq %a0 = load double, ptr %p0 %a1 = load double, ptr %p1 %v0 = insertelement <8 x double> poison, double %a0, i32 0 From 1f4ada691fe9d5488eee91a1d178776de39ded96 Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Sat, 26 Apr 2025 19:53:55 +0100 Subject: [PATCH 11/14] Address comments. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 37 ++++---- llvm/test/CodeGen/X86/build-vector-256.ll | 76 +++++++++++------ .../test/CodeGen/X86/vector-shuffle-256-v4.ll | 84 +++++++++++++++---- 3 files changed, 140 insertions(+), 57 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 51d034f7c82e9..925313e72cdf8 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -8785,28 +8785,37 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL, return LowerShift(Res, Subtarget, DAG); } +static bool isShuffleFoldableLoad(SDValue); + /// Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats /// representing a blend. static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL, X86Subtarget const &Subtarget, SelectionDAG &DAG) { MVT VT = BVOp->getSimpleValueType(0u); - auto const NumElems = VT.getVectorNumElements(); - if (VT == MVT::v4f64) { - // Collect unique operands. - auto UniqueOps = SmallSet(); - for (SDValue Op : BVOp->ops()) { - if (isIntOrFPConstant(Op) || Op.isUndef()) - return SDValue(); - UniqueOps.insert(Op); - } - // Candidate BUILD_VECTOR must have 2 unique operands. - if (UniqueOps.size() != 2u) + if (VT != MVT::v4f64) + return SDValue(); + + // Collect unique operands. 
+ auto UniqueOps = SmallSet(); + for (SDValue Op : BVOp->ops()) { + if (isIntOrFPConstant(Op) || Op.isUndef()) return SDValue(); + UniqueOps.insert(Op); + } + + // Candidate BUILD_VECTOR must have 2 unique operands. + if (UniqueOps.size() != 2u) + return SDValue(); + + SDValue Op0 = *(UniqueOps.begin()); + SDValue Op1 = *(++UniqueOps.begin()); + + if (isShuffleFoldableLoad(Op0) || isShuffleFoldableLoad(Op1) || + Subtarget.hasAVX2()) { // Create shuffle mask. - SDValue Op0 = *(UniqueOps.begin()); - SDValue Op1 = *(++UniqueOps.begin()); + auto const NumElems = VT.getVectorNumElements(); SmallVector Mask(NumElems); for (auto I = 0u; I < NumElems; ++I) { SDValue Op = BVOp->getOperand(I); @@ -8818,7 +8827,7 @@ static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL, return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, Mask); } - return {}; + return SDValue(); } /// Create a vector constant without a load. SSE/AVX provide the bare minimum diff --git a/llvm/test/CodeGen/X86/build-vector-256.ll b/llvm/test/CodeGen/X86/build-vector-256.ll index ed00cfe4c32f1..3edb712e53c8d 100644 --- a/llvm/test/CodeGen/X86/build-vector-256.ll +++ b/llvm/test/CodeGen/X86/build-vector-256.ll @@ -415,22 +415,28 @@ define <32 x i8> @test_buildvector_v32i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, ; build vectors of repeated elements define <4 x double> @test_buildvector_4f64_2_var(double %a0, double %a1) { -; AVX-32-LABEL: test_buildvector_4f64_2_var: -; AVX-32: # %bb.0: -; AVX-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm0 -; AVX-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm1 -; AVX-32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX-32-NEXT: retl +; AVX1-32-LABEL: test_buildvector_4f64_2_var: +; AVX1-32: # %bb.0: +; AVX1-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; AVX1-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] +; AVX1-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-32-NEXT: retl ; ; AVX1-64-LABEL: test_buildvector_4f64_2_var: ; AVX1-64: # %bb.0: -; AVX1-64-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] -; AVX1-64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; AVX1-64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] +; AVX1-64-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-64-NEXT: retq ; +; AVX2-32-LABEL: test_buildvector_4f64_2_var: +; AVX2-32: # %bb.0: +; AVX2-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm0 +; AVX2-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm1 +; AVX2-32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-32-NEXT: retl +; ; AVX2-64-LABEL: test_buildvector_4f64_2_var: ; AVX2-64: # %bb.0: ; AVX2-64-NEXT: vbroadcastsd %xmm1, %ymm1 @@ -445,21 +451,41 @@ define <4 x double> @test_buildvector_4f64_2_var(double %a0, double %a1) { } define <4 x double> @test_buildvector_4f64_2_load(ptr %p0, ptr %p1) { -; AVX-32-LABEL: test_buildvector_4f64_2_load: -; AVX-32: # %bb.0: -; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; AVX-32-NEXT: vbroadcastsd (%ecx), %ymm0 -; AVX-32-NEXT: vbroadcastsd (%eax), %ymm1 -; AVX-32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX-32-NEXT: retl +; AVX1-32-LABEL: test_buildvector_4f64_2_load: +; AVX1-32: # %bb.0: +; AVX1-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; 
AVX1-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX1-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-32-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-32-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-32-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-32-NEXT: retl ; -; AVX-64-LABEL: test_buildvector_4f64_2_load: -; AVX-64: # %bb.0: -; AVX-64-NEXT: vbroadcastsd (%rsi), %ymm0 -; AVX-64-NEXT: vbroadcastsd (%rdi), %ymm1 -; AVX-64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX-64-NEXT: retq +; AVX1-64-LABEL: test_buildvector_4f64_2_load: +; AVX1-64: # %bb.0: +; AVX1-64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-64-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-64-NEXT: retq +; +; AVX2-32-LABEL: test_buildvector_4f64_2_load: +; AVX2-32: # %bb.0: +; AVX2-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX2-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX2-32-NEXT: vbroadcastsd (%ecx), %ymm0 +; AVX2-32-NEXT: vbroadcastsd (%eax), %ymm1 +; AVX2-32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-32-NEXT: retl +; +; AVX2-64-LABEL: test_buildvector_4f64_2_load: +; AVX2-64: # %bb.0: +; AVX2-64-NEXT: vbroadcastsd (%rsi), %ymm0 +; AVX2-64-NEXT: vbroadcastsd (%rdi), %ymm1 +; AVX2-64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-64-NEXT: retq %a0 = load double, ptr %p0 %a1 = load double, ptr %p1 %v0 = insertelement <4 x double> poison, double %a0, i32 0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll index a746f3528a050..4cdc65e5c1b97 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -2368,12 +2368,28 @@ define <4 x double> @unpckh_v4f64(<4 x double> %x, <4 x double> %y) { } define <4 x double> @blend_broadcasts_v1f64(ptr %p0, ptr %p1) { -; ALL-LABEL: blend_broadcasts_v1f64: -; ALL: # %bb.0: -; ALL-NEXT: vbroadcastsd (%rsi), %ymm0 -; ALL-NEXT: vbroadcastsd (%rdi), %ymm1 -; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; ALL-NEXT: retq +; AVX1-LABEL: blend_broadcasts_v1f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: blend_broadcasts_v1f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastsd (%rsi), %ymm0 +; AVX2-NEXT: vbroadcastsd (%rdi), %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: blend_broadcasts_v1f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vbroadcastsd (%rsi), %ymm0 +; AVX512VL-NEXT: vbroadcastsd (%rdi), %ymm1 +; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX512VL-NEXT: retq %ld0 = load <1 x double>, ptr %p0, align 32 %ld1 = load <1 x double>, ptr %p1, align 32 %blend = shufflevector <1 x double> %ld0, <1 x double> %ld1, <4 x i32> @@ -2381,12 +2397,28 @@ define <4 x double> @blend_broadcasts_v1f64(ptr %p0, ptr %p1) { } define <4 x double> @blend_broadcasts_v1f64_4x(ptr %p0, ptr %p1) { -; ALL-LABEL: blend_broadcasts_v1f64_4x: -; ALL: # %bb.0: -; ALL-NEXT: vbroadcastsd (%rsi), %ymm0 
-; ALL-NEXT: vbroadcastsd (%rdi), %ymm1 -; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; ALL-NEXT: retq +; AVX1-LABEL: blend_broadcasts_v1f64_4x: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: blend_broadcasts_v1f64_4x: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastsd (%rsi), %ymm0 +; AVX2-NEXT: vbroadcastsd (%rdi), %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: blend_broadcasts_v1f64_4x: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vbroadcastsd (%rsi), %ymm0 +; AVX512VL-NEXT: vbroadcastsd (%rdi), %ymm1 +; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX512VL-NEXT: retq %ld0 = load <1 x double>, ptr %p0, align 32 %ld1 = load <1 x double>, ptr %p1, align 32 %bcst0 = shufflevector <1 x double> %ld0, <1 x double> poison, <4 x i32> zeroinitializer @@ -2396,12 +2428,28 @@ define <4 x double> @blend_broadcasts_v1f64_4x(ptr %p0, ptr %p1) { } define <4 x double> @blend_broadcasts_v1f64_2x(ptr %p0, ptr %p1) { -; ALL-LABEL: blend_broadcasts_v1f64_2x: -; ALL: # %bb.0: -; ALL-NEXT: vbroadcastsd (%rsi), %ymm0 -; ALL-NEXT: vbroadcastsd (%rdi), %ymm1 -; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; ALL-NEXT: retq +; AVX1-LABEL: blend_broadcasts_v1f64_2x: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: blend_broadcasts_v1f64_2x: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastsd (%rsi), %ymm0 +; AVX2-NEXT: vbroadcastsd (%rdi), %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: blend_broadcasts_v1f64_2x: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vbroadcastsd (%rsi), %ymm0 +; AVX512VL-NEXT: vbroadcastsd (%rdi), %ymm1 +; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX512VL-NEXT: retq %ld0 = load <1 x double>, ptr %p0, align 32 %ld1 = load <1 x double>, ptr %p1, align 32 %bcst0 = shufflevector <1 x double> %ld0, <1 x double> poison, <2 x i32> zeroinitializer From ec2044abb5d18f7b61fb448e03f603d9a7c80e0d Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Sat, 26 Apr 2025 20:08:07 +0100 Subject: [PATCH 12/14] Formatting. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 925313e72cdf8..77f3c070be7b4 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -8808,11 +8808,11 @@ static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL, // Candidate BUILD_VECTOR must have 2 unique operands. if (UniqueOps.size() != 2u) return SDValue(); - + SDValue Op0 = *(UniqueOps.begin()); SDValue Op1 = *(++UniqueOps.begin()); - if (isShuffleFoldableLoad(Op0) || isShuffleFoldableLoad(Op1) || + if (isShuffleFoldableLoad(Op0) || isShuffleFoldableLoad(Op1) || Subtarget.hasAVX2()) { // Create shuffle mask. 
auto const NumElems = VT.getVectorNumElements(); From 16fe22d08a9bcc0a655040e9e3ba23078c9927b2 Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Mon, 28 Apr 2025 02:47:22 +0100 Subject: [PATCH 13/14] Address comments. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 77f3c070be7b4..9f435273444d7 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -8809,8 +8809,9 @@ static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL, if (UniqueOps.size() != 2u) return SDValue(); - SDValue Op0 = *(UniqueOps.begin()); - SDValue Op1 = *(++UniqueOps.begin()); + SDValue Op0 = BVOp->getOperand(0u); + UniqueOps.erase(Op0); + SDValue Op1 = *UniqueOps.begin(); if (isShuffleFoldableLoad(Op0) || isShuffleFoldableLoad(Op1) || Subtarget.hasAVX2()) { From 02345ed6fe66e22471bc7a3ff56a64fed618bdbd Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Tue, 6 May 2025 10:56:40 +0100 Subject: [PATCH 14/14] Address comments. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 9f435273444d7..f04603867a587 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -8813,8 +8813,8 @@ static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL, UniqueOps.erase(Op0); SDValue Op1 = *UniqueOps.begin(); - if (isShuffleFoldableLoad(Op0) || isShuffleFoldableLoad(Op1) || - Subtarget.hasAVX2()) { + if (Subtarget.hasAVX2() || isShuffleFoldableLoad(Op0) || + isShuffleFoldableLoad(Op1)) { // Create shuffle mask. auto const NumElems = VT.getVectorNumElements(); SmallVector Mask(NumElems);