diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 0d5eb86bf899c..67f924aadc8c0 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -979,11 +979,11 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
     Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
     bool UseMaskForCond, bool UseMaskForGaps) const {
-  // The interleaved memory access pass will lower interleaved memory ops (i.e
-  // a load and store followed by a specific shuffle) to vlseg/vsseg
-  // intrinsics.
-  if (!UseMaskForCond && !UseMaskForGaps &&
-      Factor <= TLI->getMaxSupportedInterleaveFactor()) {
+  // The interleaved memory access pass will lower (de)interleave ops combined
+  // with an adjacent appropriate memory op to vlseg/vsseg intrinsics.
+  // vlseg/vsseg only support masking per-iteration (i.e. condition), not
+  // per-segment (i.e. gap).
+  if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
     auto *VTy = cast<VectorType>(VecTy);
     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
     // Need to make sure type hasn't been scalarized
     if (LT.second.isVector()) {
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index d62d99cf31899..05d504cbcb6bb 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -398,6 +398,10 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
 
   bool enableInterleavedAccessVectorization() const override { return true; }
 
+  bool enableMaskedInterleavedAccessVectorization() const override {
+    return ST->hasVInstructions();
+  }
+
   unsigned getMinTripCountTailFoldingThreshold() const override;
 
   enum RISCVRegisterClass { GPRRC, FPRRC, VRRC };
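Note (illustrative, not part of the patch): with masked interleaving enabled, the per-iteration predicate is replicated across the fields of each segment, which is exactly the "condition" masking vlseg/vsseg can express. A minimal factor-2 sketch of the IR shape the cost model now prices, with a hypothetical function name:

; %mask holds one i1 per loop iteration; interleaving it with itself gives
; both fields of each pair the same lane predicate (per-iteration masking).
; A per-segment ("gap") mask would need a different predicate per field,
; which vlseg/vsseg cannot encode, hence the remaining UseMaskForGaps bail.
define { <vscale x 16 x i8>, <vscale x 16 x i8> } @load_pair(ptr %p, <vscale x 16 x i1> %mask) {
  %im = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask)
  %wide = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr %p, i32 1, <vscale x 32 x i1> %im, <vscale x 32 x i8> poison)
  %pair = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %wide)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %pair
}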
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
index 6f20376e08d85..976ce77d2ba29 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
@@ -30,26 +30,25 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
 ; SCALAR_EPILOGUE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SCALAR_EPILOGUE-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SCALAR_EPILOGUE-NEXT:    [[TMP6:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; SCALAR_EPILOGUE-NEXT:    [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 1)
-; SCALAR_EPILOGUE-NEXT:    [[TMP8:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP8]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP9]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; SCALAR_EPILOGUE-NEXT:    [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1)
-; SCALAR_EPILOGUE-NEXT:    [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; SCALAR_EPILOGUE-NEXT:    [[TMP13:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP14:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP14]]
-; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x ptr> [[TMP15]], i32 1, <vscale x 16 x i1> [[TMP6]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP13]]
-; SCALAR_EPILOGUE-NEXT:    [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP17]]
-; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP16]], <vscale x 16 x ptr> [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP6]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP7:%.*]] = shl i32 [[INDEX]], 1
+; SCALAR_EPILOGUE-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; SCALAR_EPILOGUE-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]]
+; SCALAR_EPILOGUE-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]])
+; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP9]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
+; SCALAR_EPILOGUE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; SCALAR_EPILOGUE-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; SCALAR_EPILOGUE-NEXT:    [[TMP12:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP10]], <vscale x 16 x i8> [[TMP11]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP13:%.*]] = sext i32 [[TMP7]] to i64
+; SCALAR_EPILOGUE-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP13]]
+; SCALAR_EPILOGUE-NEXT:    [[TMP15:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP12]]
+; SCALAR_EPILOGUE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP12]], <vscale x 16 x i8> [[TMP15]])
+; SCALAR_EPILOGUE-NEXT:    [[INTERLEAVED_MASK3:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]])
+; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP14]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK3]])
 ; SCALAR_EPILOGUE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]]
 ; SCALAR_EPILOGUE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; SCALAR_EPILOGUE-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; SCALAR_EPILOGUE-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; SCALAR_EPILOGUE-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; SCALAR_EPILOGUE-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; SCALAR_EPILOGUE:       middle.block:
 ; SCALAR_EPILOGUE-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
 ; SCALAR_EPILOGUE-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -80,26 +79,25 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
 ; PREDICATED_DATA-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 1024)
 ; PREDICATED_DATA-NEXT:    [[TMP5:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; PREDICATED_DATA-NEXT:    [[TMP6:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_DATA-NEXT:    [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 1)
-; PREDICATED_DATA-NEXT:    [[TMP8:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; PREDICATED_DATA-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP8]]
-; PREDICATED_DATA-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP9]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; PREDICATED_DATA-NEXT:    [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1)
-; PREDICATED_DATA-NEXT:    [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; PREDICATED_DATA-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]]
-; PREDICATED_DATA-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; PREDICATED_DATA-NEXT:    [[TMP13:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
-; PREDICATED_DATA-NEXT:    [[TMP14:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; PREDICATED_DATA-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP14]]
-; PREDICATED_DATA-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x ptr> [[TMP15]], i32 1, <vscale x 16 x i1> [[TMP6]])
-; PREDICATED_DATA-NEXT:    [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP13]]
-; PREDICATED_DATA-NEXT:    [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; PREDICATED_DATA-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP17]]
-; PREDICATED_DATA-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP16]], <vscale x 16 x ptr> [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP6]])
+; PREDICATED_DATA-NEXT:    [[TMP7:%.*]] = shl i32 [[INDEX]], 1
+; PREDICATED_DATA-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; PREDICATED_DATA-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]]
+; PREDICATED_DATA-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]])
+; PREDICATED_DATA-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP9]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
+; PREDICATED_DATA-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
+; PREDICATED_DATA-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; PREDICATED_DATA-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; PREDICATED_DATA-NEXT:    [[TMP12:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP10]], <vscale x 16 x i8> [[TMP11]])
+; PREDICATED_DATA-NEXT:    [[TMP13:%.*]] = sext i32 [[TMP7]] to i64
+; PREDICATED_DATA-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP13]]
+; PREDICATED_DATA-NEXT:    [[TMP15:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP12]]
+; PREDICATED_DATA-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP12]], <vscale x 16 x i8> [[TMP15]])
+; PREDICATED_DATA-NEXT:    [[INTERLEAVED_MASK3:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]])
+; PREDICATED_DATA-NEXT:    call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP14]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK3]])
 ; PREDICATED_DATA-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
 ; PREDICATED_DATA-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; PREDICATED_DATA-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; PREDICATED_DATA-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; PREDICATED_DATA-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; PREDICATED_DATA-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; PREDICATED_DATA:       middle.block:
 ; PREDICATED_DATA-NEXT:    br label [[FOR_END:%.*]]
 ; PREDICATED_DATA:       scalar.ph:
@@ -119,25 +117,29 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
 ; PREDICATED_DATA-WITH-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; PREDICATED_DATA-WITH-EVL-NEXT:    [[AVL:%.*]] = phi i32 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP1:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true)
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP1]], i64 0
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; PREDICATED_DATA-WITH-EVL-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP1]], i64 0
 ; PREDICATED_DATA-WITH-EVL-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP2:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP3:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 1)
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP4:%.*]] = zext nneg <vscale x 16 x i32> [[TMP3]] to <vscale x 16 x i64>
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP4]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP5]], <vscale x 16 x i1> [[TMP2]], i32 [[TMP1]])
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP6:%.*]] = or disjoint <vscale x 16 x i32> [[TMP3]], splat (i32 1)
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP7:%.*]] = zext nneg <vscale x 16 x i32> [[TMP6]] to <vscale x 16 x i64>
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP7]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP8]], <vscale x 16 x i1> [[TMP2]], i32 [[TMP1]])
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP9:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP10:%.*]] = zext nneg <vscale x 16 x i32> [[TMP3]] to <vscale x 16 x i64>
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP10]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP9]], <vscale x 16 x ptr> align 1 [[TMP11]], <vscale x 16 x i1> [[TMP2]], i32 [[TMP1]])
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP12:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP9]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP6]] to <vscale x 16 x i64>
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP13]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP12]], <vscale x 16 x ptr> align 1 [[TMP14]], <vscale x 16 x i1> [[TMP2]], i32 [[TMP1]])
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP3:%.*]] = icmp ult <vscale x 16 x i32> [[TMP2]], [[BROADCAST_SPLAT4]]
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP4:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP5:%.*]] = select <vscale x 16 x i1> [[TMP3]], <vscale x 16 x i1> [[TMP4]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP6:%.*]] = shl i32 [[EVL_BASED_IV]], 1
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP7:%.*]] = sext i32 [[TMP6]] to i64
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP7]]
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> [[TMP5]])
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP8]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP11:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP9]], <vscale x 16 x i8> [[TMP10]])
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP12:%.*]] = sext i32 [[TMP6]] to i64
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP12]]
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP14:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP11]]
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[TMP14]])
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[INTERLEAVED_MASK5:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> [[TMP5]])
+; PREDICATED_DATA-WITH-EVL-NEXT:    call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP13]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK5]])
 ; PREDICATED_DATA-WITH-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP1]], [[EVL_BASED_IV]]
 ; PREDICATED_DATA-WITH-EVL-NEXT:    [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP1]]
 ; PREDICATED_DATA-WITH-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
@@ -207,42 +209,29 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
 ; SCALAR_EPILOGUE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SCALAR_EPILOGUE-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SCALAR_EPILOGUE-NEXT:    [[TMP6:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; SCALAR_EPILOGUE-NEXT:    [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 2)
-; SCALAR_EPILOGUE-NEXT:    [[TMP8:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1)
-; SCALAR_EPILOGUE-NEXT:    [[TMP9:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 2)
-; SCALAR_EPILOGUE-NEXT:    [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 3)
-; SCALAR_EPILOGUE-NEXT:    [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; SCALAR_EPILOGUE-NEXT:    [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP13]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; SCALAR_EPILOGUE-NEXT:    [[TMP15:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP15]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP16]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; SCALAR_EPILOGUE-NEXT:    [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP17]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; SCALAR_EPILOGUE-NEXT:    [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP20:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP19]]
-; SCALAR_EPILOGUE-NEXT:    [[TMP21:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER4]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER5]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP22:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP21]]
-; SCALAR_EPILOGUE-NEXT:    [[TMP23:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP23]]
-; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP19]], <vscale x 16 x ptr> [[TMP24]], i32 1, <vscale x 16 x i1> [[TMP6]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP25:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP25]]
-; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP20]], <vscale x 16 x ptr> [[TMP26]], i32 1, <vscale x 16 x i1> [[TMP6]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP27:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP27]]
-; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP21]], <vscale x 16 x ptr> [[TMP28]], i32 1, <vscale x 16 x i1> [[TMP6]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP29:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP29]]
-; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP22]], <vscale x 16 x ptr> [[TMP30]], i32 1, <vscale x 16 x i1> [[TMP6]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP7:%.*]] = shl i32 [[INDEX]], 2
+; SCALAR_EPILOGUE-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; SCALAR_EPILOGUE-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]]
+; SCALAR_EPILOGUE-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]])
+; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP9]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK]], <vscale x 64 x i8> poison)
+; SCALAR_EPILOGUE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> [[WIDE_MASKED_VEC]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; SCALAR_EPILOGUE-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; SCALAR_EPILOGUE-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 2
+; SCALAR_EPILOGUE-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 3
+; SCALAR_EPILOGUE-NEXT:    [[TMP14:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP10]], <vscale x 16 x i8> [[TMP11]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP15:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP14]]
+; SCALAR_EPILOGUE-NEXT:    [[TMP16:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP12]], <vscale x 16 x i8> [[TMP13]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP17:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP16]]
+; SCALAR_EPILOGUE-NEXT:    [[TMP18:%.*]] = sext i32 [[TMP7]] to i64
+; SCALAR_EPILOGUE-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP18]]
+; SCALAR_EPILOGUE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]], <vscale x 16 x i8> [[TMP17]])
+; SCALAR_EPILOGUE-NEXT:    [[INTERLEAVED_MASK3:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]])
+; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[TMP19]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK3]])
 ; SCALAR_EPILOGUE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]]
 ; SCALAR_EPILOGUE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; SCALAR_EPILOGUE-NEXT:    [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; SCALAR_EPILOGUE-NEXT:    br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; SCALAR_EPILOGUE-NEXT:    [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; SCALAR_EPILOGUE-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; SCALAR_EPILOGUE:       middle.block:
 ; SCALAR_EPILOGUE-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
 ; SCALAR_EPILOGUE-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -273,42 +262,29 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
 ; PREDICATED_DATA-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 1024)
 ; PREDICATED_DATA-NEXT:    [[TMP5:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; PREDICATED_DATA-NEXT:    [[TMP6:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_DATA-NEXT:    [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 2)
-; PREDICATED_DATA-NEXT:    [[TMP8:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1)
-; PREDICATED_DATA-NEXT:    [[TMP9:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 2)
-; PREDICATED_DATA-NEXT:    [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 3)
-; PREDICATED_DATA-NEXT:    [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; PREDICATED_DATA-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]]
-; PREDICATED_DATA-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; PREDICATED_DATA-NEXT:    [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
-; PREDICATED_DATA-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP13]]
-; PREDICATED_DATA-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; PREDICATED_DATA-NEXT:    [[TMP15:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
-; PREDICATED_DATA-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP15]]
-; PREDICATED_DATA-NEXT:    [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP16]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; PREDICATED_DATA-NEXT:    [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; PREDICATED_DATA-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP17]]
-; PREDICATED_DATA-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; PREDICATED_DATA-NEXT:    [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
-; PREDICATED_DATA-NEXT:    [[TMP20:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP19]]
-; PREDICATED_DATA-NEXT:    [[TMP21:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER4]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER5]])
-; PREDICATED_DATA-NEXT:    [[TMP22:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP21]]
-; PREDICATED_DATA-NEXT:    [[TMP23:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; PREDICATED_DATA-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP23]]
-; PREDICATED_DATA-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP19]], <vscale x 16 x ptr> [[TMP24]], i32 1, <vscale x 16 x i1> [[TMP6]])
-; PREDICATED_DATA-NEXT:    [[TMP25:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
-; PREDICATED_DATA-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP25]]
-; PREDICATED_DATA-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP20]], <vscale x 16 x ptr> [[TMP26]], i32 1, <vscale x 16 x i1> [[TMP6]])
-; PREDICATED_DATA-NEXT:    [[TMP27:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
-; PREDICATED_DATA-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP27]]
-; PREDICATED_DATA-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP21]], <vscale x 16 x ptr> [[TMP28]], i32 1, <vscale x 16 x i1> [[TMP6]])
-; PREDICATED_DATA-NEXT:    [[TMP29:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; PREDICATED_DATA-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP29]]
-; PREDICATED_DATA-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP22]], <vscale x 16 x ptr> [[TMP30]], i32 1, <vscale x 16 x i1> [[TMP6]])
+; PREDICATED_DATA-NEXT:    [[TMP7:%.*]] = shl i32 [[INDEX]], 2
+; PREDICATED_DATA-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; PREDICATED_DATA-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]]
+; PREDICATED_DATA-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]])
+; PREDICATED_DATA-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP9]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK]], <vscale x 64 x i8> poison)
+; PREDICATED_DATA-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> [[WIDE_MASKED_VEC]])
+; PREDICATED_DATA-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; PREDICATED_DATA-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; PREDICATED_DATA-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 2
+; PREDICATED_DATA-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 3
+; PREDICATED_DATA-NEXT:    [[TMP14:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP10]], <vscale x 16 x i8> [[TMP11]])
+; PREDICATED_DATA-NEXT:    [[TMP15:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP14]]
+; PREDICATED_DATA-NEXT:    [[TMP16:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP12]], <vscale x 16 x i8> [[TMP13]])
+; PREDICATED_DATA-NEXT:    [[TMP17:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP16]]
+; PREDICATED_DATA-NEXT:    [[TMP18:%.*]] = sext i32 [[TMP7]] to i64
+; PREDICATED_DATA-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP18]]
+; PREDICATED_DATA-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]], <vscale x 16 x i8> [[TMP17]])
+; PREDICATED_DATA-NEXT:    [[INTERLEAVED_MASK3:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> [[TMP6]])
+; PREDICATED_DATA-NEXT:    call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[TMP19]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK3]])
 ; PREDICATED_DATA-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
 ; PREDICATED_DATA-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; PREDICATED_DATA-NEXT:    [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; PREDICATED_DATA-NEXT:    br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; PREDICATED_DATA-NEXT:    [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; PREDICATED_DATA-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; PREDICATED_DATA:       middle.block:
 ; PREDICATED_DATA-NEXT:    br label [[FOR_END:%.*]]
 ; PREDICATED_DATA:       scalar.ph:
@@ -328,46 +304,38 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
 ; PREDICATED_DATA-WITH-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; PREDICATED_DATA-WITH-EVL-NEXT:    [[AVL:%.*]] = phi i32 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP1:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true)
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP1]], i64 0
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; PREDICATED_DATA-WITH-EVL-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP1]], i64 0
 ; PREDICATED_DATA-WITH-EVL-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP2:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP3:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 2)
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP4:%.*]] = or disjoint <vscale x 16 x i32> [[TMP3]], splat (i32 1)
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP5:%.*]] = or disjoint <vscale x 16 x i32> [[TMP3]], splat (i32 2)
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP6:%.*]] = or disjoint <vscale x 16 x i32> [[TMP3]], splat (i32 3)
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP7:%.*]] = zext nneg <vscale x 16 x i32> [[TMP3]] to <vscale x 16 x i64>
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP7]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP8]], <vscale x 16 x i1> [[TMP2]], i32 [[TMP1]])
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP9:%.*]] = zext nneg <vscale x 16 x i32> [[TMP4]] to <vscale x 16 x i64>
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP9]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP10]], <vscale x 16 x i1> [[TMP2]], i32 [[TMP1]])
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP5]] to <vscale x 16 x i64>
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP12]], <vscale x 16 x i1> [[TMP2]], i32 [[TMP1]])
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP6]] to <vscale x 16 x i64>
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP13]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP14]], <vscale x 16 x i1> [[TMP2]], i32 [[TMP1]])
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP15:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP3:%.*]] = icmp ult <vscale x 16 x i32> [[TMP2]], [[BROADCAST_SPLAT4]]
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP4:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP5:%.*]] = select <vscale x 16 x i1> [[TMP3]], <vscale x 16 x i1> [[TMP4]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP6:%.*]] = shl i32 [[EVL_BASED_IV]], 2
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP7:%.*]] = sext i32 [[TMP6]] to i64
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP7]]
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> [[TMP5]])
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP8]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK]], <vscale x 64 x i8> poison)
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> [[WIDE_MASKED_VEC]])
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 2
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 3
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP13:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP9]], <vscale x 16 x i8> [[TMP10]])
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP14:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP13]]
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP15:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[TMP12]])
 ; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP15]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP17:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER4]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER5]])
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP18:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP17]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP19:%.*]] = zext nneg <vscale x 16 x i32> [[TMP3]] to <vscale x 16 x i64>
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP19]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP15]], <vscale x 16 x ptr> align 1 [[TMP20]], <vscale x 16 x i1> [[TMP2]], i32 [[TMP1]])
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP21:%.*]] = zext nneg <vscale x 16 x i32> [[TMP4]] to <vscale x 16 x i64>
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP21]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP16]], <vscale x 16 x ptr> align 1 [[TMP22]], <vscale x 16 x i1> [[TMP2]], i32 [[TMP1]])
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP23:%.*]] = zext nneg <vscale x 16 x i32> [[TMP5]] to <vscale x 16 x i64>
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP23]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP17]], <vscale x 16 x ptr> align 1 [[TMP24]], <vscale x 16 x i1> [[TMP2]], i32 [[TMP1]])
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP25:%.*]] = zext nneg <vscale x 16 x i32> [[TMP6]] to <vscale x 16 x i64>
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP25]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP18]], <vscale x 16 x ptr> align 1 [[TMP26]], <vscale x 16 x i1> [[TMP2]], i32 [[TMP1]])
+-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP17:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER4]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER5]])
+-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP18:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP17]]
+-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP19:%.*]] = zext nneg <vscale x 16 x i32> [[TMP3]] to <vscale x 16 x i64>
+-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP19]]
+-; PREDICATED_DATA-WITH-EVL-NEXT:    call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP15]], <vscale x 16 x ptr> align 1 [[TMP20]], <vscale x 16 x i1> [[TMP2]], i32 [[TMP1]])
+-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP21:%.*]] = zext nneg <vscale x 16 x i32> [[TMP4]] to <vscale x 16 x i64>
+-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP21]]
+-; PREDICATED_DATA-WITH-EVL-NEXT:    call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP16]], <vscale x 16 x ptr> align 1 [[TMP22]], <vscale x 16 x i1> [[TMP2]], i32 [[TMP1]])
+-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP23:%.*]] = zext nneg <vscale x 16 x i32> [[TMP5]] to <vscale x 16 x i64>
+-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP23]]
+-; PREDICATED_DATA-WITH-EVL-NEXT:    call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP17]], <vscale x 16 x ptr> align 1 [[TMP24]], <vscale x 16 x i1> [[TMP2]], i32 [[TMP1]])
+-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP25:%.*]] = zext nneg <vscale x 16 x i32> [[TMP6]] to <vscale x 16 x i64>
+-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP25]]
+-; PREDICATED_DATA-WITH-EVL-NEXT:    call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP18]], <vscale x 16 x ptr> align 1 [[TMP26]], <vscale x 16 x i1> [[TMP2]], i32 [[TMP1]])
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP6]] to i64
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP17]]
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]])
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[INTERLEAVED_MASK5:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> [[TMP5]])
+; PREDICATED_DATA-WITH-EVL-NEXT:    call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[TMP18]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK5]])
 ; PREDICATED_DATA-WITH-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP1]], [[EVL_BASED_IV]]
 ; PREDICATED_DATA-WITH-EVL-NEXT:    [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP1]]
 ; PREDICATED_DATA-WITH-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP27:%.*]] = icmp eq i32 [[INDEX_EVL_NEXT]], 1024
-; PREDICATED_DATA-WITH-EVL-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; PREDICATED_DATA-WITH-EVL-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[INDEX_EVL_NEXT]], 1024
+; PREDICATED_DATA-WITH-EVL-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; PREDICATED_DATA-WITH-EVL:       middle.block:
 ; PREDICATED_DATA-WITH-EVL-NEXT:    br label [[FOR_END:%.*]]
 ; PREDICATED_DATA-WITH-EVL:       scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
index 1f7c51800f3e0..8d987a94d383d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
@@ -15,38 +15,35 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
 ; IF-EVL:       vector.ph:
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
-; IF-EVL-NEXT:    [[TMP6:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
+; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP5]], 1
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP2]]
 ; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
 ; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
-; IF-EVL-NEXT:    [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; IF-EVL-NEXT:    [[TMP12:%.*]] = mul <vscale x 4 x i64> [[TMP10]], splat (i64 1)
-; IF-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP12]]
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; IF-EVL-NEXT:    [[AVL:%.*]] = phi i64 [ [[N]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; IF-EVL-NEXT:    [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; IF-EVL-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP11]] to i64
-; IF-EVL-NEXT:    [[TMP9:%.*]] = mul i64 1, [[TMP13]]
-; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
-; IF-EVL-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], <vscale x 4 x i64> [[VEC_IND]], i32 0
-; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
-; IF-EVL-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[VEC_IND]], i32 1
-; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP23]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
-; IF-EVL-NEXT:    [[TMP26:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER5]], [[WIDE_MASKED_GATHER3]]
-; IF-EVL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP26]], ptr align 4 [[TMP18]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
-; IF-EVL-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP11]] to i64
-; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP14]]
-; IF-EVL-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; IF-EVL-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
-; IF-EVL-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP11]], i64 0
+; IF-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT:    [[TMP16:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; IF-EVL-NEXT:    [[TMP17:%.*]] = icmp ult <vscale x 4 x i32> [[TMP16]], [[BROADCAST_SPLAT]]
+; IF-EVL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], i64 [[EVL_BASED_IV]], i32 0
+; IF-EVL-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> [[TMP17]], <vscale x 4 x i1> [[TMP17]])
+; IF-EVL-NEXT:    [[WIDE_VEC:%.*]] = call <vscale x 8 x i32> @llvm.masked.load.nxv8i32.p0(ptr [[TMP6]], i32 4, <vscale x 8 x i1> [[INTERLEAVED_MASK]], <vscale x 8 x i32> poison)
+; IF-EVL-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; IF-EVL-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
+; IF-EVL-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; IF-EVL-NEXT:    [[TMP9:%.*]] = add nsw <vscale x 4 x i32> [[TMP15]], [[TMP14]]
+; IF-EVL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP9]], ptr align 4 [[TMP10]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-NEXT:    [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP18]]
+; IF-EVL-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; IF-EVL-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; IF-EVL:       middle.block:
 ; IF-EVL-NEXT:    br label [[FOR_COND_CLEANUP:%.*]]
 ; IF-EVL:       scalar.ph:
@@ -55,10 +52,10 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
 ; IF-EVL:       for.body:
 ; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; IF-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 0
-; IF-EVL-NEXT:    [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT:    [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; IF-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 1
-; IF-EVL-NEXT:    [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; IF-EVL-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP17]], [[TMP16]]
+; IF-EVL-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
 ; IF-EVL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
 ; IF-EVL-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
 ; IF-EVL-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
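Side note (illustrative, not part of the patch): the masked.load/masked.store forms above carry a mask but no explicit vector length, so in the EVL-tail-folded runs the explicit vector length is folded into the mask via a stepvector compare, which is the stepvector/icmp ult pattern visible in the checks. A hypothetical standalone helper showing that conversion, with lanes [0, %evl) active:

; Turn an explicit vector length %evl into an equivalent lane mask.
define <vscale x 16 x i1> @evl_to_mask(i32 %evl) {
  %splat.ins = insertelement <vscale x 16 x i32> poison, i32 %evl, i64 0
  %splat = shufflevector <vscale x 16 x i32> %splat.ins, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
  %step = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
  %mask = icmp ult <vscale x 16 x i32> %step, %splat
  ret <vscale x 16 x i1> %mask
}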