From 0e7054e1b6e0769408de14eef746ac8f0b4434a9 Mon Sep 17 00:00:00 2001
From: Philip Reames
Date: Mon, 21 Jul 2025 12:02:58 -0700
Subject: [PATCH] [RISCV][TTI] Enable masked interleave vectorization

Now that the masked.load/store support has landed for interleaved
accesses, we can let the loop vectorizer generate masked interleave
groups for the supported cases. At the moment, only scalable VFs are
supported by the lowering; that restriction will be removed once the
pending work on the shuffle vector path has all landed.

A couple of things to highlight in terms of codegen.

First, this improves vectorization of non-tail-folded loops when the
loop contains internal predication via control flow. This should only
kick in when all elements of the interleave group are predicated
together, as we still don't support using masking for gaps in the
interleave group. (By support, I mean support efficiently; if such a
group gets generated by some quirk of the cost model, we still lower
it correctly.)

Second, this enables tail folding of loops with interleave groups.
Without this change, any tail-folded loop is forced to use NF
independent strided loads or stores. This closes a moderately large
performance gap with the non-tail-folded version when the interleave
group is otherwise unconditional.

Third, for the EVL predication cases, we are not EVL-predicating the
masked interleaves. Instead, we're hitting the codegen for masked
predication. This is still a huge improvement over the prior codegen,
but it is also quite suboptimal. Rewriting interleave groups for EVL
is a separate project in the vectorizer.
---
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |  12 +-
 .../Target/RISCV/RISCVTargetTransformInfo.h   |   4 +
 .../RISCV/interleaved-masked-access.ll        | 278 ++++++++----------
 ...ectorize-force-tail-with-evl-interleave.ll |  41 ++-
 4 files changed, 157 insertions(+), 178 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 56ead92187b04..1ae14b5e565c3 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -979,12 +979,14 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
     Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
     bool UseMaskForCond, bool UseMaskForGaps) const {
-  // The interleaved memory access pass will lower interleaved memory ops (i.e
-  // a load and store followed by a specific shuffle) to vlseg/vsseg
-  // intrinsics.
-  if (!UseMaskForCond && !UseMaskForGaps &&
+  auto *VTy = cast<VectorType>(VecTy);
+
+  // The interleaved memory access pass will lower (de)interleave ops combined
+  // with an adjacent appropriate memory op to vlseg/vsseg intrinsics. We
+  // currently only support masking for the scalable path. vlseg/vsseg only
+  // support masking per-iteration (i.e. condition), not per-segment (i.e. gap).
+  if ((VTy->isScalableTy() || !UseMaskForCond) && !UseMaskForGaps &&
       Factor <= TLI->getMaxSupportedInterleaveFactor()) {
-    auto *VTy = cast<VectorType>(VecTy);
     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
     // Need to make sure type hasn't been scalarized
     if (LT.second.isVector()) {
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 12bf8c1b4de70..24b148619c771 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -398,6 +398,10 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
 
   bool enableInterleavedAccessVectorization() const override { return true; }
 
+  bool enableMaskedInterleavedAccessVectorization() const override {
+    return ST->hasVInstructions();
+  }
+
   unsigned getMinTripCountTailFoldingThreshold() const override;
 
   enum RISCVRegisterClass { GPRRC, FPRRC, VRRC };
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
index 45357dd6bf0d6..131ccca2fe298 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
@@ -30,26 +30,25 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
 ; SCALAR_EPILOGUE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SCALAR_EPILOGUE-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SCALAR_EPILOGUE-NEXT:    [[TMP6:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; SCALAR_EPILOGUE-NEXT:    [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 1)
-; SCALAR_EPILOGUE-NEXT:    [[TMP8:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP8]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP9]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; SCALAR_EPILOGUE-NEXT:    [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1)
-; SCALAR_EPILOGUE-NEXT:    [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; SCALAR_EPILOGUE-NEXT:    [[TMP13:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP14:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP14]]
-; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x ptr> [[TMP15]], i32 1, <vscale x 16 x i1> [[TMP6]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP13]]
-; SCALAR_EPILOGUE-NEXT:    [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP17]]
-; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP16]], <vscale x 16 x ptr> [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP6]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP7:%.*]] = shl i32 [[INDEX]], 1
+; SCALAR_EPILOGUE-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; SCALAR_EPILOGUE-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]]
+; SCALAR_EPILOGUE-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]])
+; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP9]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
+; SCALAR_EPILOGUE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; SCALAR_EPILOGUE-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; SCALAR_EPILOGUE-NEXT:    [[TMP12:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP10]], <vscale x 16 x i8> [[TMP11]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP13:%.*]] = sext i32 [[TMP7]] to i64
+; SCALAR_EPILOGUE-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP13]]
+; SCALAR_EPILOGUE-NEXT:    [[TMP15:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP12]]
+; SCALAR_EPILOGUE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP12]], <vscale x 16 x i8> [[TMP15]])
+; SCALAR_EPILOGUE-NEXT:    [[INTERLEAVED_MASK3:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]])
+; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP14]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK3]])
 ; SCALAR_EPILOGUE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]]
 ; SCALAR_EPILOGUE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; SCALAR_EPILOGUE-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; SCALAR_EPILOGUE-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; SCALAR_EPILOGUE-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; SCALAR_EPILOGUE-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; SCALAR_EPILOGUE:       middle.block:
 ; SCALAR_EPILOGUE-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
 ; SCALAR_EPILOGUE-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -80,26 +79,25 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 1024)
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 1)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP8]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP9]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP14]]
-; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x ptr> [[TMP15]], i32 1, <vscale x 16 x i1> [[TMP6]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP13]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP17]]
-; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP16]], <vscale x 16 x ptr> [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP6]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP7:%.*]] = shl i32 [[INDEX]], 1
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP9]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP10]], <vscale x 16 x i8> [[TMP11]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = sext i32 [[TMP7]] to i64
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP13]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP12]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP12]], <vscale x 16 x i8> [[TMP15]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK3:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]])
+; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP14]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK3]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; PREDICATED_TAIL_FOLDING-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; PREDICATED_TAIL_FOLDING-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; PREDICATED_TAIL_FOLDING:       middle.block:
 ; PREDICATED_TAIL_FOLDING-NEXT:    br label [[FOR_END:%.*]]
 ; PREDICATED_TAIL_FOLDING:       scalar.ph:
@@ -129,28 +127,29 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
 ; PREDICATED_EVL-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true)
 ; PREDICATED_EVL-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
 ; PREDICATED_EVL-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_EVL-NEXT:    [[TMP6:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_EVL-NEXT:    [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 1)
-; PREDICATED_EVL-NEXT:    [[TMP8:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; PREDICATED_EVL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP8]]
-; PREDICATED_EVL-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP9]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT:    [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1)
-; PREDICATED_EVL-NEXT:    [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; PREDICATED_EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]]
-; PREDICATED_EVL-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP12]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT:    [[TMP13:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
-; PREDICATED_EVL-NEXT:    [[TMP14:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; PREDICATED_EVL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP14]]
-; PREDICATED_EVL-NEXT:    call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x ptr> align 1 [[TMP15]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT:    [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP13]]
-; PREDICATED_EVL-NEXT:    [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; PREDICATED_EVL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP17]]
-; PREDICATED_EVL-NEXT:    call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP16]], <vscale x 16 x ptr> align 1 [[TMP18]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
+; PREDICATED_EVL-NEXT:    [[TMP6:%.*]] = icmp ult <vscale x 16 x i32> [[VEC_IND]], splat (i32 1024)
+; PREDICATED_EVL-NEXT:    [[TMP7:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; PREDICATED_EVL-NEXT:    [[TMP8:%.*]] = select <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_EVL-NEXT:    [[TMP9:%.*]] = shl i32 [[EVL_BASED_IV]], 1
+; PREDICATED_EVL-NEXT:    [[TMP10:%.*]] = sext i32 [[TMP9]] to i64
+; PREDICATED_EVL-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP10]]
+; PREDICATED_EVL-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP8]], <vscale x 16 x i1> [[TMP8]])
+; PREDICATED_EVL-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP11]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
+; PREDICATED_EVL-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
+; PREDICATED_EVL-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; PREDICATED_EVL-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; PREDICATED_EVL-NEXT:    [[TMP14:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP12]], <vscale x 16 x i8> [[TMP13]])
+; PREDICATED_EVL-NEXT:    [[TMP15:%.*]] = sext i32 [[TMP9]] to i64
+; PREDICATED_EVL-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP15]]
+; PREDICATED_EVL-NEXT:    [[TMP17:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP14]]
+; PREDICATED_EVL-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP17]])
+; PREDICATED_EVL-NEXT:    [[INTERLEAVED_MASK3:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP8]], <vscale x 16 x i1> [[TMP8]])
+; PREDICATED_EVL-NEXT:    call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP16]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK3]])
 ; PREDICATED_EVL-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP5]], [[EVL_BASED_IV]]
 ; PREDICATED_EVL-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
 ; PREDICATED_EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; PREDICATED_EVL-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; PREDICATED_EVL-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; PREDICATED_EVL-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; PREDICATED_EVL-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; PREDICATED_EVL:       middle.block:
 ; PREDICATED_EVL-NEXT:    br label [[FOR_END:%.*]]
 ; PREDICATED_EVL:       scalar.ph:
@@ -215,42 +214,29 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
 ; SCALAR_EPILOGUE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SCALAR_EPILOGUE-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SCALAR_EPILOGUE-NEXT:    [[TMP6:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; SCALAR_EPILOGUE-NEXT:    [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 2)
-; SCALAR_EPILOGUE-NEXT:    [[TMP8:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1)
-; SCALAR_EPILOGUE-NEXT:    [[TMP9:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 2)
-; SCALAR_EPILOGUE-NEXT:    [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 3)
-; SCALAR_EPILOGUE-NEXT:    [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; SCALAR_EPILOGUE-NEXT:    [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP13]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; SCALAR_EPILOGUE-NEXT:    [[TMP15:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP15]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP16]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; SCALAR_EPILOGUE-NEXT:    [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP17]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; SCALAR_EPILOGUE-NEXT:    [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP20:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP19]]
-; SCALAR_EPILOGUE-NEXT:    [[TMP21:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER4]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER5]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP22:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP21]]
-; SCALAR_EPILOGUE-NEXT:    [[TMP23:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP23]]
-; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP19]], <vscale x 16 x ptr> [[TMP24]], i32 1, <vscale x 16 x i1> [[TMP6]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP25:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP25]]
-; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP20]], <vscale x 16 x ptr> [[TMP26]], i32 1, <vscale x 16 x i1> [[TMP6]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP27:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP27]]
-; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP21]], <vscale x 16 x ptr> [[TMP28]], i32 1, <vscale x 16 x i1> [[TMP6]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP29:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP29]]
-; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP22]], <vscale x 16 x ptr> [[TMP30]], i32 1, <vscale x 16 x i1> [[TMP6]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP7:%.*]] = shl i32 [[INDEX]], 2
+; SCALAR_EPILOGUE-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; SCALAR_EPILOGUE-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]]
+; SCALAR_EPILOGUE-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]])
+; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP9]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK]], <vscale x 64 x i8> poison)
+; SCALAR_EPILOGUE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> [[WIDE_MASKED_VEC]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; SCALAR_EPILOGUE-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; SCALAR_EPILOGUE-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 2
+; SCALAR_EPILOGUE-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 3
+; SCALAR_EPILOGUE-NEXT:    [[TMP14:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP10]], <vscale x 16 x i8> [[TMP11]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP15:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP14]]
+; SCALAR_EPILOGUE-NEXT:    [[TMP16:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP12]], <vscale x 16 x i8> [[TMP13]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP17:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP16]]
+; SCALAR_EPILOGUE-NEXT:    [[TMP18:%.*]] = sext i32 [[TMP7]] to i64
+; SCALAR_EPILOGUE-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP18]]
+; SCALAR_EPILOGUE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]], <vscale x 16 x i8> [[TMP17]])
+; SCALAR_EPILOGUE-NEXT:    [[INTERLEAVED_MASK3:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]])
+; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[TMP19]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK3]])
 ; SCALAR_EPILOGUE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]]
 ; SCALAR_EPILOGUE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; SCALAR_EPILOGUE-NEXT:    [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; SCALAR_EPILOGUE-NEXT:    br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; SCALAR_EPILOGUE-NEXT:    [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; SCALAR_EPILOGUE-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; SCALAR_EPILOGUE:       middle.block:
 ; SCALAR_EPILOGUE-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
 ; SCALAR_EPILOGUE-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -281,42 +267,29 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 1024)
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 2)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 2)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 3)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP13]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP15]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP16]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP17]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP20:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP19]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP21:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER4]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER5]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP22:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP21]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP23:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP23]]
-; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP19]], <vscale x 16 x ptr> [[TMP24]], i32 1, <vscale x 16 x i1> [[TMP6]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP25:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP25]]
-; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP20]], <vscale x 16 x ptr> [[TMP26]], i32 1, <vscale x 16 x i1> [[TMP6]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP27:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP27]]
-; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP21]], <vscale x 16 x ptr> [[TMP28]], i32 1, <vscale x 16 x i1> [[TMP6]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP29:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP29]]
-; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP22]], <vscale x 16 x ptr> [[TMP30]], i32 1, <vscale x 16 x i1> [[TMP6]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP7:%.*]] = shl i32 [[INDEX]], 2
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP9]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK]], <vscale x 64 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> [[WIDE_MASKED_VEC]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 2
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 3
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP10]], <vscale x 16 x i8> [[TMP11]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP14]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP12]], <vscale x 16 x i8> [[TMP13]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP16]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP18:%.*]] = sext i32 [[TMP7]] to i64
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP18]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]], <vscale x 16 x i8> [[TMP17]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK3:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]])
+; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[TMP19]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK3]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; PREDICATED_TAIL_FOLDING-NEXT:    br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; PREDICATED_TAIL_FOLDING-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; PREDICATED_TAIL_FOLDING:       middle.block:
 ; PREDICATED_TAIL_FOLDING-NEXT:    br label [[FOR_END:%.*]]
 ; PREDICATED_TAIL_FOLDING:       scalar.ph:
@@ -346,44 +319,33 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
 ; PREDICATED_EVL-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true)
 ; PREDICATED_EVL-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
 ; PREDICATED_EVL-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_EVL-NEXT:    [[TMP6:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_EVL-NEXT:    [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 2)
-; PREDICATED_EVL-NEXT:    [[TMP8:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1)
-; PREDICATED_EVL-NEXT:    [[TMP9:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 2)
-; PREDICATED_EVL-NEXT:    [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 3)
-; PREDICATED_EVL-NEXT:    [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; PREDICATED_EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]]
-; PREDICATED_EVL-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP12]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT:    [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
-; PREDICATED_EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP13]]
-; PREDICATED_EVL-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP14]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT:    [[TMP15:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
-; PREDICATED_EVL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP15]]
-; PREDICATED_EVL-NEXT:    [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP16]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT:    [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; PREDICATED_EVL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP17]]
-; PREDICATED_EVL-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP18]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT:    [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
-; PREDICATED_EVL-NEXT:    [[TMP20:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP19]]
-; PREDICATED_EVL-NEXT:    [[TMP21:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER4]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER5]])
-; PREDICATED_EVL-NEXT:    [[TMP22:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP21]]
-; PREDICATED_EVL-NEXT:    [[TMP23:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; PREDICATED_EVL-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP23]]
-; PREDICATED_EVL-NEXT:    call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP19]], <vscale x 16 x ptr> align 1 [[TMP24]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT:    [[TMP25:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
-; PREDICATED_EVL-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP25]]
-; PREDICATED_EVL-NEXT:    call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP20]], <vscale x 16 x ptr> align 1 [[TMP26]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT:    [[TMP27:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
-; PREDICATED_EVL-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP27]]
-; PREDICATED_EVL-NEXT:    call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP21]], <vscale x 16 x ptr> align 1 [[TMP28]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT:    [[TMP29:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; PREDICATED_EVL-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP29]]
-; PREDICATED_EVL-NEXT:    call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP22]], <vscale x 16 x ptr> align 1 [[TMP30]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
+; PREDICATED_EVL-NEXT:    [[TMP6:%.*]] = icmp ult <vscale x 16 x i32> [[VEC_IND]], splat (i32 1024)
+; PREDICATED_EVL-NEXT:    [[TMP7:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; PREDICATED_EVL-NEXT:    [[TMP8:%.*]] = select <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_EVL-NEXT:    [[TMP9:%.*]] = shl i32 [[EVL_BASED_IV]], 2
+; PREDICATED_EVL-NEXT:    [[TMP10:%.*]] = sext i32 [[TMP9]] to i64
+; PREDICATED_EVL-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP10]]
+; PREDICATED_EVL-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP8]], <vscale x 16 x i1> [[TMP8]], <vscale x 16 x i1> [[TMP8]], <vscale x 16 x i1> [[TMP8]])
+; PREDICATED_EVL-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP11]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK]], <vscale x 64 x i8> poison)
+; PREDICATED_EVL-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> [[WIDE_MASKED_VEC]])
+; PREDICATED_EVL-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; PREDICATED_EVL-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; PREDICATED_EVL-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 2
+; PREDICATED_EVL-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 3
+; PREDICATED_EVL-NEXT:    [[TMP16:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP12]], <vscale x 16 x i8> [[TMP13]])
+; PREDICATED_EVL-NEXT:    [[TMP17:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP16]]
+; PREDICATED_EVL-NEXT:    [[TMP18:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]])
+; PREDICATED_EVL-NEXT:    [[TMP19:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP18]]
+; PREDICATED_EVL-NEXT:    [[TMP20:%.*]] = sext i32 [[TMP9]] to i64
+; PREDICATED_EVL-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP20]]
+; PREDICATED_EVL-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> [[TMP16]], <vscale x 16 x i8> [[TMP17]], <vscale x 16 x i8> [[TMP18]], <vscale x 16 x i8> [[TMP19]])
+; PREDICATED_EVL-NEXT:    [[INTERLEAVED_MASK3:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP8]], <vscale x 16 x i1> [[TMP8]], <vscale x 16 x i1> [[TMP8]], <vscale x 16 x i1> [[TMP8]])
+; PREDICATED_EVL-NEXT:    call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[TMP21]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK3]])
 ; PREDICATED_EVL-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP5]], [[EVL_BASED_IV]]
 ; PREDICATED_EVL-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
 ; PREDICATED_EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; PREDICATED_EVL-NEXT:    [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; PREDICATED_EVL-NEXT:    br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; PREDICATED_EVL-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; PREDICATED_EVL-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; PREDICATED_EVL:       middle.block:
 ; PREDICATED_EVL-NEXT:    br label [[FOR_END:%.*]]
 ; PREDICATED_EVL:       scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
index fe6a693e83816..56df2393b3f5d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
@@ -27,26 +27,38 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
 ; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
 ; IF-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT:    [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; IF-EVL-NEXT:    [[TMP12:%.*]] = mul <vscale x 4 x i64> [[TMP10]], splat (i64 1)
-; IF-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP12]]
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP8]], i64 0
+; IF-EVL-NEXT:    [[TMP15:%.*]] = add i64 [[TMP8]], 0
+; IF-EVL-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 1
+; IF-EVL-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX]], [[TMP16]]
+; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX]], i64 0
 ; IF-EVL-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT:    [[STEP_ADD:%.*]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT:    [[TMP22:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; IF-EVL-NEXT:    [[TMP10:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP22]]
+; IF-EVL-NEXT:    [[VEC_IND:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT2]], [[TMP10]]
+; IF-EVL-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4
+; IF-EVL-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP12]], i64 0
+; IF-EVL-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT:    [[TMP13:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; IF-EVL-NEXT:    [[TMP14:%.*]] = add <vscale x 4 x i64> [[DOTSPLAT]], [[TMP13]]
+; IF-EVL-NEXT:    [[STEP_ADD:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT2]], [[TMP14]]
 ; IF-EVL-NEXT:    [[TMP19:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; IF-EVL-NEXT:    [[TMP20:%.*]] = icmp ule <vscale x 4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]]
-; IF-EVL-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], <vscale x 4 x i64> [[VEC_IND]], i32 0
-; IF-EVL-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[STEP_ADD]], i32 0
-; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP21]], i32 4, <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> poison)
-; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP22]], i32 4, <vscale x 4 x i1> [[TMP20]], <vscale x 4 x i32> poison)
-; IF-EVL-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[VEC_IND]], i32 1
-; IF-EVL-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[STEP_ADD]], i32 1
-; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP23]], i32 4, <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> poison)
-; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP24]], i32 4, <vscale x 4 x i1> [[TMP20]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], i64 [[INDEX]], i32 0
+; IF-EVL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[TMP21]], i32 0
+; IF-EVL-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> [[TMP19]], <vscale x 4 x i1> [[TMP19]])
+; IF-EVL-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.masked.load.nxv8i32.p0(ptr [[TMP17]], i32 4, <vscale x 8 x i1> [[INTERLEAVED_MASK]], <vscale x 8 x i32> poison)
+; IF-EVL-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_MASKED_VEC]])
+; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
+; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; IF-EVL-NEXT:    [[INTERLEAVED_MASK6:%.*]] = call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> [[TMP20]], <vscale x 4 x i1> [[TMP20]])
+; IF-EVL-NEXT:    [[WIDE_MASKED_VEC7:%.*]] = call <vscale x 8 x i32> @llvm.masked.load.nxv8i32.p0(ptr [[TMP18]], i32 4, <vscale x 8 x i1> [[INTERLEAVED_MASK6]], <vscale x 8 x i32> poison)
+; IF-EVL-NEXT:    [[STRIDED_VEC8:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_MASKED_VEC7]])
+; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC8]], 0
+; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC8]], 1
 ; IF-EVL-NEXT:    [[TMP25:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER4]], [[WIDE_MASKED_GATHER]]
 ; IF-EVL-NEXT:    [[TMP26:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER5]], [[WIDE_MASKED_GATHER3]]
 ; IF-EVL-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
@@ -57,7 +69,6 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
 ; IF-EVL-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP25]], ptr [[TMP29]], i32 4, <vscale x 4 x i1> [[TMP19]])
 ; IF-EVL-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP26]], ptr [[TMP32]], i32 4, <vscale x 4 x i1> [[TMP20]])
 ; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
-; IF-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT2]]
 ; IF-EVL-NEXT:    [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; IF-EVL-NEXT:    br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; IF-EVL:       middle.block:
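
Note (illustrative, not part of the patch): the IR below is a minimal standalone sketch of the
factor-2 pattern visible in the CHECK lines above, with hypothetical function and value names.
The per-iteration predicate is duplicated across every member of the segment, which is why the
no-gaps restriction in the cost-model change suffices. Given the masked.load/store lowering the
commit message refers to, this pair should become a masked vlseg2e8.v/vsseg2e8.v on a target
with V instructions.

declare <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
declare <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
declare <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr, i32 immarg, <vscale x 32 x i1>, <vscale x 32 x i8>)
declare void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8>, ptr, i32 immarg, <vscale x 32 x i1>)

define void @masked_interleave2_sketch(ptr %p, ptr %q, <vscale x 16 x i1> %mask) {
entry:
  ; Duplicate the per-iteration predicate across both members of each
  ; segment: a factor-2 interleave group with no gaps.
  %im = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask)
  ; One wide masked load covers the whole interleave group.
  %wide = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr %p, i32 1, <vscale x 32 x i1> %im, <vscale x 32 x i8> poison)
  ; Split the wide value back into the group's two fields.
  %dei = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %wide)
  %f0 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %dei, 0
  %f1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %dei, 1
  ; Arbitrary per-field computation, then the mirrored store path.
  %sum = add <vscale x 16 x i8> %f0, %f1
  %neg = sub <vscale x 16 x i8> zeroinitializer, %sum
  %iv = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> %sum, <vscale x 16 x i8> %neg)
  call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> %iv, ptr %q, i32 1, <vscale x 32 x i1> %im)
  ret void
}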