diff --git a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_ld1_vnum.c b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_ld1_vnum.c index fb86690f07f1d..e4c93ade35d53 100644 --- a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_ld1_vnum.c +++ b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_ld1_vnum.c @@ -9,29 +9,31 @@ // CHECK-C-LABEL: define dso_local void @test_svld1_hor_vnum_za8( // CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-C-NEXT: entry: -// CHECK-C-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP0]], [[VNUM]] -// CHECK-C-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-C-NEXT: [[TMP2:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-C-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]] -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP3]]) -// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]] -// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[ADD]], 15 -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP4]]) +// CHECK-C-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() +// CHECK-C-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM]], 4 +// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[TMP0]] +// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] +// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32 +// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] +// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP4]]) +// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] +// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 15 +// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP5]]) // CHECK-C-NEXT: ret void // // CHECK-CXX-LABEL: define dso_local void @_Z23test_svld1_hor_vnum_za8ju10__SVBool_tPKvl( // CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-CXX-NEXT: entry: -// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP0]], [[VNUM]] -// CHECK-CXX-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-CXX-NEXT: [[TMP2:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-CXX-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]] -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP3]]) -// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]] -// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[ADD]], 15 -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP4]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() +// CHECK-CXX-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM]], 4 +// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[TMP0]] +// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] +// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP4]]) +// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] +// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 
[[ADD]], 15 +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP5]]) // CHECK-CXX-NEXT: ret void // void test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") { @@ -43,30 +45,32 @@ void test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, const void *ptr, // CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-C-NEXT: entry: // CHECK-C-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) -// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]] -// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz( [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]]) -// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 7 -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz( [[TMP0]], ptr [[TMP2]], i32 1, i32 [[TMP5]]) +// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() +// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4 +// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]] +// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] +// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32 +// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz( [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]]) +// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-C-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 7 +// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz( [[TMP0]], ptr [[TMP3]], i32 1, i32 [[TMP6]]) // CHECK-C-NEXT: ret void // // CHECK-CXX-LABEL: define dso_local void @_Z24test_svld1_hor_vnum_za16ju10__SVBool_tPKvl( // CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-CXX-NEXT: entry: // CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) -// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]] -// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz( [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]]) -// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 7 -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz( [[TMP0]], ptr [[TMP2]], i32 1, i32 [[TMP5]]) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() +// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4 +// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]] +// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] +// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-CXX-NEXT: tail call void 
@llvm.aarch64.sme.ld1h.horiz( [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]]) +// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-CXX-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 7 +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz( [[TMP0]], ptr [[TMP3]], i32 1, i32 [[TMP6]]) // CHECK-CXX-NEXT: ret void // void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") { @@ -78,30 +82,32 @@ void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, // CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-C-NEXT: entry: // CHECK-C-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) -// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]] -// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz( [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]]) -// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 3 -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz( [[TMP0]], ptr [[TMP2]], i32 3, i32 [[TMP5]]) +// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() +// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4 +// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]] +// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] +// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32 +// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz( [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]]) +// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-C-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 3 +// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz( [[TMP0]], ptr [[TMP3]], i32 3, i32 [[TMP6]]) // CHECK-C-NEXT: ret void // // CHECK-CXX-LABEL: define dso_local void @_Z24test_svld1_hor_vnum_za32ju10__SVBool_tPKvl( // CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-CXX-NEXT: entry: // CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) -// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]] -// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz( [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]]) -// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 3 -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz( [[TMP0]], ptr [[TMP2]], i32 3, i32 [[TMP5]]) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() +// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4 +// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]] +// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 
[[MULVL]] +// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz( [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]]) +// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-CXX-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 3 +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz( [[TMP0]], ptr [[TMP3]], i32 3, i32 [[TMP6]]) // CHECK-CXX-NEXT: ret void // void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") { @@ -113,30 +119,32 @@ void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, // CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-C-NEXT: entry: // CHECK-C-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) -// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]] -// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz( [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]]) -// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 1 -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz( [[TMP0]], ptr [[TMP2]], i32 7, i32 [[TMP5]]) +// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() +// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4 +// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]] +// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] +// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32 +// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz( [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]]) +// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-C-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 1 +// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz( [[TMP0]], ptr [[TMP3]], i32 7, i32 [[TMP6]]) // CHECK-C-NEXT: ret void // // CHECK-CXX-LABEL: define dso_local void @_Z24test_svld1_hor_vnum_za64ju10__SVBool_tPKvl( // CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-CXX-NEXT: entry: // CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) -// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]] -// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz( [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]]) -// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 1 -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz( [[TMP0]], ptr [[TMP2]], i32 7, i32 [[TMP5]]) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() +// 
CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4 +// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]] +// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] +// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz( [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]]) +// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-CXX-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 1 +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz( [[TMP0]], ptr [[TMP3]], i32 7, i32 [[TMP6]]) // CHECK-CXX-NEXT: ret void // void test_svld1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") { @@ -148,26 +156,28 @@ void test_svld1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, // CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-C-NEXT: entry: // CHECK-C-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG]]) -// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]] -// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz( [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]]) -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz( [[TMP0]], ptr [[TMP2]], i32 15, i32 [[TMP4]]) +// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() +// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4 +// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]] +// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] +// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32 +// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz( [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]]) +// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz( [[TMP0]], ptr [[TMP3]], i32 15, i32 [[TMP5]]) // CHECK-C-NEXT: ret void // // CHECK-CXX-LABEL: define dso_local void @_Z25test_svld1_hor_vnum_za128ju10__SVBool_tPKvl( // CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-CXX-NEXT: entry: // CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG]]) -// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]] -// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz( [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]]) -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz( [[TMP0]], ptr [[TMP2]], i32 15, i32 [[TMP4]]) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() +// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4 +// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]] +// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr 
[[PTR]], i64 [[MULVL]] +// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz( [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]]) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz( [[TMP0]], ptr [[TMP3]], i32 15, i32 [[TMP5]]) // CHECK-CXX-NEXT: ret void // void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") { @@ -178,29 +188,31 @@ void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr // CHECK-C-LABEL: define dso_local void @test_svld1_ver_hor_za8( // CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-C-NEXT: entry: -// CHECK-C-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP0]], [[VNUM]] -// CHECK-C-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-C-NEXT: [[TMP2:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-C-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]] -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert( [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP3]]) -// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]] -// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[ADD]], 15 -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert( [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP4]]) +// CHECK-C-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() +// CHECK-C-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM]], 4 +// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[TMP0]] +// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] +// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32 +// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] +// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert( [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP4]]) +// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] +// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 15 +// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert( [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP5]]) // CHECK-C-NEXT: ret void // // CHECK-CXX-LABEL: define dso_local void @_Z22test_svld1_ver_hor_za8ju10__SVBool_tPKvl( // CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-CXX-NEXT: entry: -// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP0]], [[VNUM]] -// CHECK-CXX-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-CXX-NEXT: [[TMP2:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-CXX-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]] -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert( [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP3]]) -// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]] -// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[ADD]], 15 -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert( [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP4]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() +// CHECK-CXX-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM]], 4 +// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[TMP0]] +// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] +// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32 +// 
CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert( [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP4]]) +// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] +// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 15 +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert( [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP5]]) // CHECK-CXX-NEXT: ret void // void test_svld1_ver_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") { @@ -212,30 +224,32 @@ void test_svld1_ver_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr, i // CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-C-NEXT: entry: // CHECK-C-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) -// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]] -// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert( [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]]) -// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 7 -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert( [[TMP0]], ptr [[TMP2]], i32 1, i32 [[TMP5]]) +// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() +// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4 +// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]] +// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] +// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32 +// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert( [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]]) +// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-C-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 7 +// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert( [[TMP0]], ptr [[TMP3]], i32 1, i32 [[TMP6]]) // CHECK-C-NEXT: ret void // // CHECK-CXX-LABEL: define dso_local void @_Z24test_svld1_ver_vnum_za16ju10__SVBool_tPKvl( // CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-CXX-NEXT: entry: // CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) -// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]] -// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert( [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]]) -// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 7 -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert( [[TMP0]], ptr [[TMP2]], i32 1, i32 [[TMP5]]) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() +// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4 +// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 
[[TMP2]], [[TMP1]] +// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] +// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert( [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]]) +// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-CXX-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 7 +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert( [[TMP0]], ptr [[TMP3]], i32 1, i32 [[TMP6]]) // CHECK-CXX-NEXT: ret void // void test_svld1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") { @@ -247,30 +261,32 @@ void test_svld1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, // CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-C-NEXT: entry: // CHECK-C-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) -// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]] -// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert( [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]]) -// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 3 -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert( [[TMP0]], ptr [[TMP2]], i32 3, i32 [[TMP5]]) +// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() +// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4 +// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]] +// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] +// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32 +// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert( [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]]) +// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-C-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 3 +// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert( [[TMP0]], ptr [[TMP3]], i32 3, i32 [[TMP6]]) // CHECK-C-NEXT: ret void // // CHECK-CXX-LABEL: define dso_local void @_Z24test_svld1_ver_vnum_za32ju10__SVBool_tPKvl( // CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-CXX-NEXT: entry: // CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) -// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]] -// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert( [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]]) -// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 3 -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert( [[TMP0]], ptr [[TMP2]], i32 3, i32 [[TMP5]]) 
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() +// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4 +// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]] +// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] +// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert( [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]]) +// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-CXX-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 3 +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert( [[TMP0]], ptr [[TMP3]], i32 3, i32 [[TMP6]]) // CHECK-CXX-NEXT: ret void // void test_svld1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") { @@ -282,30 +298,32 @@ void test_svld1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, // CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-C-NEXT: entry: // CHECK-C-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) -// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]] -// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert( [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]]) -// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 1 -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert( [[TMP0]], ptr [[TMP2]], i32 7, i32 [[TMP5]]) +// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() +// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4 +// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]] +// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] +// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32 +// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert( [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]]) +// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-C-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 1 +// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert( [[TMP0]], ptr [[TMP3]], i32 7, i32 [[TMP6]]) // CHECK-C-NEXT: ret void // // CHECK-CXX-LABEL: define dso_local void @_Z24test_svld1_ver_vnum_za64ju10__SVBool_tPKvl( // CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-CXX-NEXT: entry: // CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) -// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]] -// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert( [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]]) -// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], 
[[TMP3]] -// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 1 -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert( [[TMP0]], ptr [[TMP2]], i32 7, i32 [[TMP5]]) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() +// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4 +// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]] +// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] +// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert( [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]]) +// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-CXX-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 1 +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert( [[TMP0]], ptr [[TMP3]], i32 7, i32 [[TMP6]]) // CHECK-CXX-NEXT: ret void // void test_svld1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") { @@ -317,26 +335,28 @@ void test_svld1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, // CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-C-NEXT: entry: // CHECK-C-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG]]) -// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]] -// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert( [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]]) -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert( [[TMP0]], ptr [[TMP2]], i32 15, i32 [[TMP4]]) +// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() +// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4 +// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]] +// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] +// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32 +// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert( [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]]) +// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert( [[TMP0]], ptr [[TMP3]], i32 15, i32 [[TMP5]]) // CHECK-C-NEXT: ret void // // CHECK-CXX-LABEL: define dso_local void @_Z25test_svld1_ver_vnum_za128ju10__SVBool_tPKvl( // CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-CXX-NEXT: entry: // CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG]]) -// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]] -// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert( [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]]) -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert( [[TMP0]], ptr [[TMP2]], i32 15, i32 [[TMP4]]) +// 
CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() +// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4 +// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]] +// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] +// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert( [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]]) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert( [[TMP0]], ptr [[TMP3]], i32 15, i32 [[TMP5]]) // CHECK-CXX-NEXT: ret void // void test_svld1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") { diff --git a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_st1_vnum.c b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_st1_vnum.c index dafc3d61a05f1..22a0b9eabaea3 100644 --- a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_st1_vnum.c +++ b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_st1_vnum.c @@ -9,29 +9,31 @@ // CHECK-C-LABEL: define dso_local void @test_svst1_hor_vnum_za8( // CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-C-NEXT: entry: -// CHECK-C-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP0]], [[VNUM]] -// CHECK-C-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-C-NEXT: [[TMP2:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-C-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]] -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz( [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP3]]) -// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]] -// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[ADD]], 15 -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz( [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP4]]) +// CHECK-C-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() +// CHECK-C-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM]], 4 +// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[TMP0]] +// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] +// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32 +// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] +// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz( [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP4]]) +// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] +// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 15 +// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz( [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP5]]) // CHECK-C-NEXT: ret void // // CHECK-CXX-LABEL: define dso_local void @_Z23test_svst1_hor_vnum_za8ju10__SVBool_tPvl( // CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-CXX-NEXT: entry: -// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP0]], [[VNUM]] -// CHECK-CXX-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-CXX-NEXT: [[TMP2:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-CXX-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]] -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz( [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP3]]) -// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]] -// 
CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[ADD]], 15 -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz( [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP4]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() +// CHECK-CXX-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM]], 4 +// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[TMP0]] +// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] +// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32 +// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz( [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP4]]) +// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] +// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 15 +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz( [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP5]]) // CHECK-CXX-NEXT: ret void // void test_svst1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") { @@ -43,30 +45,32 @@ void test_svst1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_ // CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-C-NEXT: entry: // CHECK-C-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) -// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]] -// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz( [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]]) -// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 7 -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz( [[TMP0]], ptr [[TMP2]], i32 1, i32 [[TMP5]]) +// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() +// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4 +// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]] +// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] +// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32 +// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz( [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]]) +// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-C-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 7 +// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz( [[TMP0]], ptr [[TMP3]], i32 1, i32 [[TMP6]]) // CHECK-C-NEXT: ret void // // CHECK-CXX-LABEL: define dso_local void @_Z24test_svst1_hor_vnum_za16ju10__SVBool_tPvl( // CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-CXX-NEXT: entry: // CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG]]) -// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]] -// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-CXX-NEXT: 
tail call void @llvm.aarch64.sme.st1h.horiz( [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]]) -// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 7 -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz( [[TMP0]], ptr [[TMP2]], i32 1, i32 [[TMP5]]) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() +// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4 +// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]] +// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] +// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz( [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]]) +// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-CXX-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 7 +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz( [[TMP0]], ptr [[TMP3]], i32 1, i32 [[TMP6]]) // CHECK-CXX-NEXT: ret void // void test_svst1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") { @@ -78,30 +82,32 @@ void test_svst1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64 // CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-C-NEXT: entry: // CHECK-C-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) -// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]] -// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz( [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]]) -// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 3 -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz( [[TMP0]], ptr [[TMP2]], i32 3, i32 [[TMP5]]) +// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() +// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4 +// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]] +// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] +// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32 +// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz( [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]]) +// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-C-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 3 +// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz( [[TMP0]], ptr [[TMP3]], i32 3, i32 [[TMP6]]) // CHECK-C-NEXT: ret void // // CHECK-CXX-LABEL: define dso_local void @_Z24test_svst1_hor_vnum_za32ju10__SVBool_tPvl( // CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-CXX-NEXT: entry: // CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG]]) -// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]] -// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], 
i64 [[MULVL]] -// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz( [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]]) -// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 3 -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz( [[TMP0]], ptr [[TMP2]], i32 3, i32 [[TMP5]]) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() +// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4 +// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]] +// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] +// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz( [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]]) +// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-CXX-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 3 +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz( [[TMP0]], ptr [[TMP3]], i32 3, i32 [[TMP6]]) // CHECK-CXX-NEXT: ret void // void test_svst1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") { @@ -113,30 +119,32 @@ void test_svst1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64 // CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-C-NEXT: entry: // CHECK-C-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) -// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]] -// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz( [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]]) -// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 1 -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz( [[TMP0]], ptr [[TMP2]], i32 7, i32 [[TMP5]]) +// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() +// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4 +// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]] +// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] +// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32 +// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz( [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]]) +// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-C-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 1 +// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz( [[TMP0]], ptr [[TMP3]], i32 7, i32 [[TMP6]]) // CHECK-C-NEXT: ret void // // CHECK-CXX-LABEL: define dso_local void @_Z24test_svst1_hor_vnum_za64ju10__SVBool_tPvl( // CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-CXX-NEXT: entry: // CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG]]) -// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail 
call i64 @llvm.aarch64.sme.cntsb() -// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]] -// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz( [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]]) -// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 1 -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz( [[TMP0]], ptr [[TMP2]], i32 7, i32 [[TMP5]]) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() +// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4 +// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]] +// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] +// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32 +// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz( [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]]) +// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-CXX-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 1 +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz( [[TMP0]], ptr [[TMP3]], i32 7, i32 [[TMP6]]) // CHECK-CXX-NEXT: ret void // void test_svst1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") { @@ -148,26 +156,28 @@ void test_svst1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64 // CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-C-NEXT: entry: // CHECK-C-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG]]) -// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]] -// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]] -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz( [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]]) -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz( [[TMP0]], ptr [[TMP2]], i32 15, i32 [[TMP4]]) +// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() +// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4 +// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]] +// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] +// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32 +// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]] +// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz( [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]]) +// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz( [[TMP0]], ptr [[TMP3]], i32 15, i32 [[TMP5]]) // CHECK-C-NEXT: ret void // // CHECK-CXX-LABEL: define dso_local void @_Z25test_svst1_hor_vnum_za128ju10__SVBool_tPvl( // CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-CXX-NEXT: entry: // CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv1i1( [[PG]]) -// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-CXX-NEXT: 
[[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]]
-// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i32 15, i32 [[TMP4]])
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
+// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP3]], i32 15, i32 [[TMP5]])
 // CHECK-CXX-NEXT: ret void
 //
 void test_svst1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
@@ -178,29 +188,31 @@ void test_svst1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int6
 // CHECK-C-LABEL: define dso_local void @test_svst1_ver_vnum_za8(
 // CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-C-NEXT: entry:
-// CHECK-C-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
-// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP0]], [[VNUM]]
-// CHECK-C-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-C-NEXT: [[TMP2:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-C-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP3]])
-// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
-// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[ADD]], 15
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP4]])
+// CHECK-C-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-C-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM]], 4
+// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 15
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP5]])
 // CHECK-C-NEXT: ret void
 //
 // CHECK-CXX-LABEL: define dso_local void @_Z23test_svst1_ver_vnum_za8ju10__SVBool_tPvl(
 // CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-CXX-NEXT: entry:
-// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
-// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP0]], [[VNUM]]
-// CHECK-CXX-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-CXX-NEXT: [[TMP2:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-CXX-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP3]])
-// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
-// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[ADD]], 15
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP4]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM]], 4
+// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 15
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP5]])
 // CHECK-CXX-NEXT: ret void
 //
 void test_svst1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
@@ -212,30 +224,32 @@ void test_svst1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_
 // CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-C-NEXT: entry:
 // CHECK-C-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG]])
-// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
-// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]]
-// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 7
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 1, i32 [[TMP5]])
+// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
+// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
+// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
+// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
+// CHECK-C-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 7
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]], i32 1, i32 [[TMP6]])
 // CHECK-C-NEXT: ret void
 //
 // CHECK-CXX-LABEL: define dso_local void @_Z24test_svst1_ver_vnum_za16ju10__SVBool_tPvl(
 // CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-CXX-NEXT: entry:
 // CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG]])
-// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
-// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]]
-// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 7
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 1, i32 [[TMP5]])
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
+// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
+// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 7
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]], i32 1, i32 [[TMP6]])
 // CHECK-CXX-NEXT: ret void
 //
 void test_svst1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
@@ -247,30 +261,32 @@ void test_svst1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64
 // CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-C-NEXT: entry:
 // CHECK-C-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG]])
-// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
-// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]]
-// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 3
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 3, i32 [[TMP5]])
+// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
+// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
+// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
+// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
+// CHECK-C-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 3
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]], i32 3, i32 [[TMP6]])
 // CHECK-C-NEXT: ret void
 //
 // CHECK-CXX-LABEL: define dso_local void @_Z24test_svst1_ver_vnum_za32ju10__SVBool_tPvl(
 // CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-CXX-NEXT: entry:
 // CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG]])
-// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
-// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]]
-// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 3
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 3, i32 [[TMP5]])
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
+// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
+// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 3
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]], i32 3, i32 [[TMP6]])
 // CHECK-CXX-NEXT: ret void
 //
 void test_svst1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
@@ -282,30 +298,32 @@ void test_svst1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64
 // CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-C-NEXT: entry:
 // CHECK-C-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG]])
-// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
-// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]]
-// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 1
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 7, i32 [[TMP5]])
+// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
+// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
+// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
+// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
+// CHECK-C-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 1
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]], i32 7, i32 [[TMP6]])
 // CHECK-C-NEXT: ret void
 //
 // CHECK-CXX-LABEL: define dso_local void @_Z24test_svst1_ver_vnum_za64ju10__SVBool_tPvl(
 // CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-CXX-NEXT: entry:
 // CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG]])
-// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
-// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]]
-// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 1
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 7, i32 [[TMP5]])
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
+// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
+// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 1
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]], i32 7, i32 [[TMP6]])
 // CHECK-CXX-NEXT: ret void
 //
 void test_svst1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
@@ -317,26 +335,28 @@ void test_svst1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64
 // CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-C-NEXT: entry:
 // CHECK-C-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG]])
-// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
-// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]]
-// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i32 15, i32 [[TMP4]])
+// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
+// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
+// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP3]], i32 15, i32 [[TMP5]])
 // CHECK-C-NEXT: ret void
 //
 // CHECK-CXX-LABEL: define dso_local void @_Z25test_svst1_ver_vnum_za128ju10__SVBool_tPvl(
 // CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-CXX-NEXT: entry:
 // CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG]])
-// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
-// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]]
-// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i32 15, i32 [[TMP4]])
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
+// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP3]], i32 15, i32 [[TMP5]])
 // CHECK-CXX-NEXT: ret void
 //
 void test_svst1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 17f0028e43fc3..70c8ec023fcd9 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2094,6 +2094,20 @@ instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
              : std::nullopt;
 }
 
+static std::optional<Instruction *>
+instCombineSMECntsElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts,
+                       const AArch64Subtarget *ST) {
+  if (!ST->isStreaming())
+    return std::nullopt;
+
+  // In streaming-mode, aarch64_sme_cnts is equivalent to aarch64_sve_cnt
+  // with SVEPredPattern::all
+  Value *Cnt = IC.Builder.CreateElementCount(
+      II.getType(), ElementCount::getScalable(NumElts));
+  Cnt->takeName(&II);
+  return IC.replaceInstUsesWith(II, Cnt);
+}
+
 static std::optional<Instruction *>
 instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II) {
   Value *PgVal = II.getArgOperand(0);
@@ -2803,6 +2817,14 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
     return instCombineSVECntElts(IC, II, 8);
   case Intrinsic::aarch64_sve_cntb:
     return instCombineSVECntElts(IC, II, 16);
+  case Intrinsic::aarch64_sme_cntsd:
+    return instCombineSMECntsElts(IC, II, 2, ST);
+  case Intrinsic::aarch64_sme_cntsw:
+    return instCombineSMECntsElts(IC, II, 4, ST);
+  case Intrinsic::aarch64_sme_cntsh:
+    return instCombineSMECntsElts(IC, II, 8, ST);
+  case Intrinsic::aarch64_sme_cntsb:
+    return instCombineSMECntsElts(IC, II, 16, ST);
   case Intrinsic::aarch64_sve_ptest_any:
   case Intrinsic::aarch64_sve_ptest_first:
   case Intrinsic::aarch64_sve_ptest_last:
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sme-intrinsic-opts-counting-elems.ll b/llvm/test/Transforms/InstCombine/AArch64/sme-intrinsic-opts-counting-elems.ll
new file mode 100644
index 0000000000000..f213c0b53f6ef
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sme-intrinsic-opts-counting-elems.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mattr=+sme -S -passes=instcombine < %s | FileCheck %s
+; RUN: opt -mattr=+sme -S -passes=instcombine -force-streaming-compatible < %s | FileCheck %s
+; RUN: opt -mattr=+sme -S -passes=instcombine -force-streaming < %s | FileCheck %s -check-prefix=CHECK-STREAMING
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define i64 @cntsb() {
+; CHECK-LABEL: @cntsb(
+; CHECK-NEXT: [[OUT:%.*]] = call i64 @llvm.aarch64.sme.cntsb()
+; CHECK-NEXT: ret i64 [[OUT]]
+;
+; CHECK-STREAMING-LABEL: @cntsb(
+; CHECK-STREAMING-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-STREAMING-NEXT: [[OUT:%.*]] = shl nuw i64 [[TMP1]], 4
+; CHECK-STREAMING-NEXT: ret i64 [[OUT]]
+;
+  %out = call i64 @llvm.aarch64.sme.cntsb()
+  ret i64 %out
+}
+
+define i64 @cntsh() {
+; CHECK-LABEL: @cntsh(
+; CHECK-NEXT: [[OUT:%.*]] = call i64 @llvm.aarch64.sme.cntsh()
+; CHECK-NEXT: ret i64 [[OUT]]
+;
+; CHECK-STREAMING-LABEL: @cntsh(
+; CHECK-STREAMING-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-STREAMING-NEXT: [[OUT:%.*]] = shl nuw i64 [[TMP1]], 3
+; CHECK-STREAMING-NEXT: ret i64 [[OUT]]
+;
+  %out = call i64 @llvm.aarch64.sme.cntsh()
+  ret i64 %out
+}
+
+define i64 @cntsw() {
+; CHECK-LABEL: @cntsw(
+; CHECK-NEXT: [[OUT:%.*]] = call i64 @llvm.aarch64.sme.cntsw()
+; CHECK-NEXT: ret i64 [[OUT]]
+;
+; CHECK-STREAMING-LABEL: @cntsw(
+; CHECK-STREAMING-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-STREAMING-NEXT: [[OUT:%.*]] = shl nuw i64 [[TMP1]], 2
+; CHECK-STREAMING-NEXT: ret i64 [[OUT]]
+;
+  %out = call i64 @llvm.aarch64.sme.cntsw()
+  ret i64 %out
+}
+
+define i64 @cntsd() {
+; CHECK-LABEL: @cntsd(
+; CHECK-NEXT: [[OUT:%.*]] = call i64 @llvm.aarch64.sme.cntsd()
+; CHECK-NEXT: ret i64 [[OUT]]
+;
+; CHECK-STREAMING-LABEL: @cntsd(
+; CHECK-STREAMING-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-STREAMING-NEXT: [[OUT:%.*]] = shl nuw i64 [[TMP1]], 1
+; CHECK-STREAMING-NEXT: ret i64 [[OUT]]
+;
+  %out = call i64 @llvm.aarch64.sme.cntsd()
+  ret i64 %out
+}
+
+declare i64 @llvm.aarch64.sme.cntsb()
+declare i64 @llvm.aarch64.sme.cntsh()
+declare i64 @llvm.aarch64.sme.cntsw()
+declare i64 @llvm.aarch64.sme.cntsd()
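
Note (illustration only, not part of the patch): the clang and InstCombine changes above implement the same fold. In streaming mode the streaming vector length in bytes is 16 * vscale, so aarch64.sme.cntsb (and the h/w/d variants) can be rewritten in terms of llvm.vscale rather than kept as an opaque call. The sketch below is a hypothetical ACLE example of where this matters; the function name is invented, and svcntsb() from arm_sme.h is assumed to map onto the aarch64.sme.cntsb intrinsic.

#include <arm_sme.h>

// Hypothetical example: after this change, svcntsb() in a streaming
// function is expected to fold to (vscale << 4) in the optimized IR
// instead of a call to @llvm.aarch64.sme.cntsb(), matching the updated
// CHECK lines in the clang tests above.
uint64_t streaming_vl_bytes(void) __arm_streaming {
  return svcntsb();
}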