[AArch64][SME] Lower aarch64.sme.cnts* to vscale when in streaming mode #154305
Conversation
In streaming mode, the @llvm.aarch64.sme.cnts* and @llvm.aarch64.sve.cnt* intrinsics are equivalent. For SVE, cnt* is lowered in instCombineIntrinsic to @llvm.vscale(). This patch lowers the SME intrinsics similarly when in streaming mode.
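To illustrate the transform, here is a minimal IR sketch based on the CHECK lines in the patch; the function name and attribute are illustrative, and the exact folded form instcombine emits may differ:

; Input: a streaming function that queries the streaming vector length in bytes.
define i64 @cntsb_bytes() "aarch64_pstate_sm_enabled" {
  %svl.b = call i64 @llvm.aarch64.sme.cntsb()
  ret i64 %svl.b
}

; After instcombine in streaming mode the call is rewritten in terms of vscale,
; since SVL equals VL while PSTATE.SM is set (cntsb == 16 * vscale):
define i64 @cntsb_bytes() "aarch64_pstate_sm_enabled" {
  %vscale = call i64 @llvm.vscale.i64()
  %svl.b = mul i64 %vscale, 16
  ret i64 %svl.b
}

declare i64 @llvm.aarch64.sme.cntsb()
declare i64 @llvm.vscale.i64()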
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-aarch64
Author: Kerry McLaughlin (kmclaughlin-arm)
Changes:
In streaming mode, the @llvm.aarch64.sme.cnts* and @llvm.aarch64.sve.cnt* intrinsics are equivalent. For SVE, cnt* is lowered in instCombineIntrinsic to @llvm.vscale(). This patch lowers the SME intrinsics similarly when in streaming mode.
Patch is 82.17 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154305.diff
4 Files Affected:
diff --git a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_ld1_vnum.c b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_ld1_vnum.c
index fb86690f07f1d..e4c93ade35d53 100644
--- a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_ld1_vnum.c
+++ b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_ld1_vnum.c
@@ -9,29 +9,31 @@
// CHECK-C-LABEL: define dso_local void @test_svld1_hor_vnum_za8(
// CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
// CHECK-C-NEXT: entry:
-// CHECK-C-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
-// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP0]], [[VNUM]]
-// CHECK-C-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-C-NEXT: [[TMP2:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-C-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP3]])
-// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
-// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[ADD]], 15
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP4]])
+// CHECK-C-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-C-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM]], 4
+// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 15
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP5]])
// CHECK-C-NEXT: ret void
//
// CHECK-CXX-LABEL: define dso_local void @_Z23test_svld1_hor_vnum_za8ju10__SVBool_tPKvl(
// CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
// CHECK-CXX-NEXT: entry:
-// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
-// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP0]], [[VNUM]]
-// CHECK-CXX-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-CXX-NEXT: [[TMP2:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-CXX-NEXT: [[TMP3:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP3]])
-// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP2]]
-// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[ADD]], 15
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i32 0, i32 [[TMP4]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = shl i64 [[VNUM]], 4
+// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[TMP0]]
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
+// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 15
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP2]], i32 0, i32 [[TMP5]])
// CHECK-CXX-NEXT: ret void
//
void test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
@@ -43,30 +45,32 @@ void test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, const void *ptr,
// CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-C-NEXT: entry:
// CHECK-C-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG]])
-// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
-// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]]
-// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 7
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 1, i32 [[TMP5]])
+// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
+// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
+// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
+// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
+// CHECK-C-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 7
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]], i32 1, i32 [[TMP6]])
// CHECK-C-NEXT: ret void
//
// CHECK-CXX-LABEL: define dso_local void @_Z24test_svld1_hor_vnum_za16ju10__SVBool_tPKvl(
// CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-CXX-NEXT: entry:
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG]])
-// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
-// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]]
-// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 7
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i32 1, i32 [[TMP5]])
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
+// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
+// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 7
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP3]], i32 1, i32 [[TMP6]])
// CHECK-CXX-NEXT: ret void
//
void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
@@ -78,30 +82,32 @@ void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr,
// CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-C-NEXT: entry:
// CHECK-C-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG]])
-// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
-// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]]
-// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 3
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 3, i32 [[TMP5]])
+// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
+// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
+// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
+// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
+// CHECK-C-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 3
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]], i32 3, i32 [[TMP6]])
// CHECK-C-NEXT: ret void
//
// CHECK-CXX-LABEL: define dso_local void @_Z24test_svld1_hor_vnum_za32ju10__SVBool_tPKvl(
// CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-CXX-NEXT: entry:
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG]])
-// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
-// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]]
-// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 3
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i32 3, i32 [[TMP5]])
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
+// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
+// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 3
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP3]], i32 3, i32 [[TMP6]])
// CHECK-CXX-NEXT: ret void
//
void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
@@ -113,30 +119,32 @@ void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr,
// CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-C-NEXT: entry:
// CHECK-C-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG]])
-// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
-// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]]
-// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 1
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 7, i32 [[TMP5]])
+// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
+// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
+// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
+// CHECK-C-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
+// CHECK-C-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 1
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]], i32 7, i32 [[TMP6]])
// CHECK-C-NEXT: ret void
//
// CHECK-CXX-LABEL: define dso_local void @_Z24test_svld1_hor_vnum_za64ju10__SVBool_tPKvl(
// CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-CXX-NEXT: entry:
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG]])
-// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
-// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]]
-// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[ADD]], 1
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i32 7, i32 [[TMP5]])
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
+// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
+// CHECK-CXX-NEXT: [[ADD:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
+// CHECK-CXX-NEXT: [[TMP6:%.*]] = add i32 [[ADD]], 1
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP3]], i32 7, i32 [[TMP6]])
// CHECK-CXX-NEXT: ret void
//
void test_svld1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
@@ -148,26 +156,28 @@ void test_svld1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr,
// CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-C-NEXT: entry:
// CHECK-C-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG]])
-// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
-// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]]
-// CHECK-C-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-C-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-C-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i32 15, i32 [[TMP4]])
+// CHECK-C-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-C-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
+// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
+// CHECK-C-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-C-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-C-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
+// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP3]], i32 15, i32 [[TMP5]])
// CHECK-C-NEXT: ret void
//
// CHECK-CXX-LABEL: define dso_local void @_Z25test_svld1_hor_vnum_za128ju10__SVBool_tPKvl(
// CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-CXX-NEXT: entry:
// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG]])
-// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
-// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM]]
-// CHECK-CXX-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
-// CHECK-CXX-NEXT: [[TMP3:%.*]] = trunc i64 [[VNUM]] to i32
-// CHECK-CXX-NEXT: [[TMP4:%.*]] = add i32 [[SLICE_BASE]], [[TMP3]]
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i32 0, i32 [[TMP4]])
-// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i32 15, i32 [[TMP4]])
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = shl i64 [[VNUM]], 4
+// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[TMP2]], [[TMP1]]
+// CHECK-CXX-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]]
+// CHECK-CXX-NEXT: [[TMP4:%.*]] = trunc i64 [[VNUM]] to i32
+// CHECK-CXX-NEXT: [[TMP5:%.*]] = add i32 [[SLICE_BASE]], [[TMP4]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP3]], i32 0, i32 [[TMP5]])
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP3]], i32 15, i32 [[TMP5]])
// CHECK-CXX-NEXT: ret void
//
void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") {
@@ -178,29 +188,31 @@ void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr
// CHECK-C-LABEL: define dso_local void @test_svld1_ver_hor_za8(
// CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-C-NEXT: entry:
-// CHECK-C-NEXT: [...
[truncated]
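As the updated CHECK lines above show, once cntsb is expressed as 16 * vscale, the constant factor folds onto the vnum operand of the offset calculation. A sketch of the resulting sequence (value names are illustrative):

  %vscale = call i64 @llvm.vscale.i64()
  %vnum.x16 = shl i64 %vnum, 4          ; vnum * 16
  %mulvl = mul i64 %vnum.x16, %vscale   ; vnum * 16 * vscale == vnum * cntsb
  %addr = getelementptr i8, ptr %ptr, i64 %mulvl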
@@ -0,0 +1,67 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mattr=+sme -S -passes=instcombine < %s | FileCheck %s -check-prefix=NON-STREAMING
; RUN: opt -mattr=+sme -S -passes=instcombine -force-streaming < %s | FileCheck %s -check-prefix=STREAMING |
Do you mind adding a RUN line for the streaming-compatible use case as well?
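For reference, such a RUN line could follow the same pattern as the existing ones; this is only a sketch, assuming the -force-streaming-compatible flag is available to opt and using an illustrative check prefix:

; RUN: opt -mattr=+sme -S -passes=instcombine -force-streaming-compatible < %s | FileCheck %s -check-prefix=STREAMING-COMPATIBLE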