From 936aa775554b0f1de8d491818f785f747069d723 Mon Sep 17 00:00:00 2001 From: Pete Chou Date: Thu, 26 Jun 2025 15:45:00 -0700 Subject: [PATCH 1/2] [GlobalISel] Allow Legalizer to lower volatile memcpy family. This change updates legalizer to allow lowering volatile memcpy family as a target might rely on lowering to legalize them. Also, legalizer already lowers volatile G_MEMCPY_INLINE and has the capability to lower memcpy family. For targets like aarch64 use legalizer to lower memcpy family as a combiner optimization, the change adds an additional argument for lowering to skip volatile and keep the existing behavior in that case. --- .../llvm/CodeGen/GlobalISel/LegalizerHelper.h | 3 +- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 2 +- .../CodeGen/GlobalISel/LegalizerHelper.cpp | 7 +++-- .../AMDGPU/GlobalISel/legalize-memcpy.mir | 30 +++++++++++++++++++ .../GlobalISel/legalize-memcpyinline.mir | 30 +++++++++++++++++++ .../AMDGPU/GlobalISel/legalize-memmove.mir | 30 +++++++++++++++++++ .../AMDGPU/GlobalISel/legalize-memset.mir | 29 ++++++++++++++++++ 7 files changed, 126 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h index ea0873f41ebba..be8169c79f219 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h @@ -489,7 +489,8 @@ class LegalizerHelper { LLVM_ABI LegalizeResult lowerVectorReduction(MachineInstr &MI); LLVM_ABI LegalizeResult lowerMemcpyInline(MachineInstr &MI); LLVM_ABI LegalizeResult lowerMemCpyFamily(MachineInstr &MI, - unsigned MaxLen = 0); + unsigned MaxLen = 0, + bool SkipVolatile = false); LLVM_ABI LegalizeResult lowerVAArg(MachineInstr &MI); }; diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index b1e851183de0d..9e203ce863639 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -1704,7 +1704,7 @@ bool CombinerHelper::tryCombineMemCpyFamily(MachineInstr &MI, MachineIRBuilder HelperBuilder(MI); GISelObserverWrapper DummyObserver; LegalizerHelper Helper(HelperBuilder.getMF(), DummyObserver, HelperBuilder); - return Helper.lowerMemCpyFamily(MI, MaxLen) == + return Helper.lowerMemCpyFamily(MI, MaxLen, /*SkipVolatile=*/true) == LegalizerHelper::LegalizeResult::Legalized; } diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index b87b029d01632..b5af3c64d50fa 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -10066,7 +10066,8 @@ LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src, } LegalizerHelper::LegalizeResult -LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen) { +LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen, + bool SkipVolatile) { const unsigned Opc = MI.getOpcode(); // This combine is fairly complex so it's not written with a separate // matcher function. @@ -10099,8 +10100,8 @@ LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen) { } bool IsVolatile = MemOp->isVolatile(); - // Don't try to optimize volatile. - if (IsVolatile) + // Don't try to optimize volatile when not allowed. + if (SkipVolatile && IsVolatile) return UnableToLegalize; if (MaxLen && KnownLen > MaxLen) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpy.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpy.mir index be3fe91407fdf..4f5f52b25cdf7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpy.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpy.mir @@ -31,3 +31,33 @@ body: | S_ENDPGM 0 ... +--- +name: memcpy_test_volatile +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + + ; CHECK-LABEL: name: memcpy_test_volatile + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV1]](p0) :: (volatile load (s8)) + ; CHECK-NEXT: G_STORE [[LOAD]](s32), [[MV]](p0) :: (volatile store (s8)) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(p0) = G_MERGE_VALUES %0:_(s32), %1:_(s32) + %3:_(s32) = COPY $vgpr2 + %4:_(s32) = COPY $vgpr3 + %5:_(p0) = G_MERGE_VALUES %3:_(s32), %4:_(s32) + %6:_(s32) = G_CONSTANT i32 1 + %7:_(s64) = G_ZEXT %6:_(s32) + G_MEMCPY %2:_(p0), %5:_(p0), %7:_(s64), 0 :: (volatile store (s8)), (volatile load (s8)) + S_ENDPGM 0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpyinline.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpyinline.mir index a82ca30209820..0392aef6fe030 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpyinline.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpyinline.mir @@ -31,3 +31,33 @@ body: | S_ENDPGM 0 ... +--- +name: memcpyinline_test_volatile +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + + ; CHECK-LABEL: name: memcpyinline_test_volatile + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV1]](p0) :: (volatile load (s8)) + ; CHECK-NEXT: G_STORE [[LOAD]](s32), [[MV]](p0) :: (volatile store (s8)) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(p0) = G_MERGE_VALUES %0:_(s32), %1:_(s32) + %3:_(s32) = COPY $vgpr2 + %4:_(s32) = COPY $vgpr3 + %5:_(p0) = G_MERGE_VALUES %3:_(s32), %4:_(s32) + %6:_(s32) = G_CONSTANT i32 1 + %7:_(s64) = G_ZEXT %6:_(s32) + G_MEMCPY_INLINE %2:_(p0), %5:_(p0), %7:_(s64) :: (volatile store (s8)), (volatile load (s8)) + S_ENDPGM 0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memmove.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memmove.mir index e7cfaab135beb..1f8d1aac24ebb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memmove.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memmove.mir @@ -31,3 +31,33 @@ body: | S_ENDPGM 0 ... +--- +name: memmove_test_volatile +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + + ; CHECK-LABEL: name: memmove_test_volatile + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV1]](p0) :: (volatile load (s8)) + ; CHECK-NEXT: G_STORE [[LOAD]](s32), [[MV]](p0) :: (volatile store (s8)) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(p0) = G_MERGE_VALUES %0:_(s32), %1:_(s32) + %3:_(s32) = COPY $vgpr2 + %4:_(s32) = COPY $vgpr3 + %5:_(p0) = G_MERGE_VALUES %3:_(s32), %4:_(s32) + %6:_(s32) = G_CONSTANT i32 1 + %7:_(s64) = G_ZEXT %6:_(s32) + G_MEMMOVE %2:_(p0), %5:_(p0), %7:_(s64), 0 :: (volatile store (s8)), (volatile load (s8)) + S_ENDPGM 0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memset.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memset.mir index 021cebbb6cb49..dda94e1550585 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memset.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memset.mir @@ -30,3 +30,32 @@ body: | S_ENDPGM 0 ... +--- +name: memset_test_volatile +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; CHECK-LABEL: name: memset_test_volatile + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY2]](s32) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s8) = COPY [[TRUNC]](s8) + ; CHECK-NEXT: G_STORE [[COPY2]](s32), [[MV]](p0) :: (volatile store (s8)) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(p0) = G_MERGE_VALUES %0:_(s32), %1:_(s32) + %3:_(s32) = COPY $vgpr2 + %4:_(s16) = G_TRUNC %3:_(s32) + %5:_(s8) = G_TRUNC %4:_(s16) + %6:_(s32) = G_CONSTANT i32 1 + %7:_(s64) = G_ZEXT %6:_(s32) + G_MEMSET %2:_(p0), %5:_(s8), %7:_(s64), 0 :: (volatile store (s8)) + S_ENDPGM 0 + +... From 13f1a7bcd55ec1780d631eb1b906613aa82eb190 Mon Sep 17 00:00:00 2001 From: Pete Chou Date: Tue, 15 Jul 2025 09:48:59 -0700 Subject: [PATCH 2/2] address review comment --- .../llvm/CodeGen/GlobalISel/LegalizerHelper.h | 3 +- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 2 +- .../CodeGen/GlobalISel/LegalizerHelper.cpp | 9 +- llvm/test/CodeGen/AArch64/aarch64-mops.ll | 188 +++++++----------- 4 files changed, 71 insertions(+), 131 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h index be8169c79f219..ea0873f41ebba 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h @@ -489,8 +489,7 @@ class LegalizerHelper { LLVM_ABI LegalizeResult lowerVectorReduction(MachineInstr &MI); LLVM_ABI LegalizeResult lowerMemcpyInline(MachineInstr &MI); LLVM_ABI LegalizeResult lowerMemCpyFamily(MachineInstr &MI, - unsigned MaxLen = 0, - bool SkipVolatile = false); + unsigned MaxLen = 0); LLVM_ABI LegalizeResult lowerVAArg(MachineInstr &MI); }; diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 9e203ce863639..b1e851183de0d 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -1704,7 +1704,7 @@ bool CombinerHelper::tryCombineMemCpyFamily(MachineInstr &MI, MachineIRBuilder HelperBuilder(MI); GISelObserverWrapper DummyObserver; LegalizerHelper Helper(HelperBuilder.getMF(), DummyObserver, HelperBuilder); - return Helper.lowerMemCpyFamily(MI, MaxLen, /*SkipVolatile=*/true) == + return Helper.lowerMemCpyFamily(MI, MaxLen) == LegalizerHelper::LegalizeResult::Legalized; } diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index b5af3c64d50fa..a68f6542760cf 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -10066,8 +10066,7 @@ LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src, } LegalizerHelper::LegalizeResult -LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen, - bool SkipVolatile) { +LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen) { const unsigned Opc = MI.getOpcode(); // This combine is fairly complex so it's not written with a separate // matcher function. @@ -10099,14 +10098,10 @@ LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen, return Legalized; } - bool IsVolatile = MemOp->isVolatile(); - // Don't try to optimize volatile when not allowed. - if (SkipVolatile && IsVolatile) - return UnableToLegalize; - if (MaxLen && KnownLen > MaxLen) return UnableToLegalize; + bool IsVolatile = MemOp->isVolatile(); if (Opc == TargetOpcode::G_MEMCPY) { auto &MF = *MI.getParent()->getParent(); const auto &TLI = *MF.getSubtarget().getTargetLowering(); diff --git a/llvm/test/CodeGen/AArch64/aarch64-mops.ll b/llvm/test/CodeGen/AArch64/aarch64-mops.ll index ff7872c922e32..83530049a50d6 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-mops.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-mops.ll @@ -87,46 +87,17 @@ entry: } define void @memset_10_zeroval_volatile(ptr %dst) { -; GISel-WITHOUT-MOPS-O0-LABEL: memset_10_zeroval_volatile: -; GISel-WITHOUT-MOPS-O0: // %bb.0: // %entry -; GISel-WITHOUT-MOPS-O0-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; GISel-WITHOUT-MOPS-O0-NEXT: .cfi_def_cfa_offset 16 -; GISel-WITHOUT-MOPS-O0-NEXT: .cfi_offset w30, -16 -; GISel-WITHOUT-MOPS-O0-NEXT: mov w8, #10 // =0xa -; GISel-WITHOUT-MOPS-O0-NEXT: mov w2, w8 -; GISel-WITHOUT-MOPS-O0-NEXT: mov w1, wzr -; GISel-WITHOUT-MOPS-O0-NEXT: bl memset -; GISel-WITHOUT-MOPS-O0-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; GISel-WITHOUT-MOPS-O0-NEXT: ret -; -; GISel-WITHOUT-MOPS-O3-LABEL: memset_10_zeroval_volatile: -; GISel-WITHOUT-MOPS-O3: // %bb.0: // %entry -; GISel-WITHOUT-MOPS-O3-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; GISel-WITHOUT-MOPS-O3-NEXT: .cfi_def_cfa_offset 16 -; GISel-WITHOUT-MOPS-O3-NEXT: .cfi_offset w30, -16 -; GISel-WITHOUT-MOPS-O3-NEXT: mov w1, wzr -; GISel-WITHOUT-MOPS-O3-NEXT: mov w2, #10 // =0xa -; GISel-WITHOUT-MOPS-O3-NEXT: bl memset -; GISel-WITHOUT-MOPS-O3-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; GISel-WITHOUT-MOPS-O3-NEXT: ret -; -; GISel-MOPS-O0-LABEL: memset_10_zeroval_volatile: -; GISel-MOPS-O0: // %bb.0: // %entry -; GISel-MOPS-O0-NEXT: mov w8, #10 // =0xa -; GISel-MOPS-O0-NEXT: // kill: def $x8 killed $w8 -; GISel-MOPS-O0-NEXT: mov x9, xzr -; GISel-MOPS-O0-NEXT: setp [x0]!, x8!, x9 -; GISel-MOPS-O0-NEXT: setm [x0]!, x8!, x9 -; GISel-MOPS-O0-NEXT: sete [x0]!, x8!, x9 -; GISel-MOPS-O0-NEXT: ret +; GISel-WITHOUT-MOPS-LABEL: memset_10_zeroval_volatile: +; GISel-WITHOUT-MOPS: // %bb.0: // %entry +; GISel-WITHOUT-MOPS-NEXT: str xzr, [x0] +; GISel-WITHOUT-MOPS-NEXT: strh wzr, [x0, #8] +; GISel-WITHOUT-MOPS-NEXT: ret ; -; GISel-MOPS-O3-LABEL: memset_10_zeroval_volatile: -; GISel-MOPS-O3: // %bb.0: // %entry -; GISel-MOPS-O3-NEXT: mov w8, #10 // =0xa -; GISel-MOPS-O3-NEXT: setp [x0]!, x8!, xzr -; GISel-MOPS-O3-NEXT: setm [x0]!, x8!, xzr -; GISel-MOPS-O3-NEXT: sete [x0]!, x8!, xzr -; GISel-MOPS-O3-NEXT: ret +; GISel-MOPS-LABEL: memset_10_zeroval_volatile: +; GISel-MOPS: // %bb.0: // %entry +; GISel-MOPS-NEXT: str xzr, [x0] +; GISel-MOPS-NEXT: strh wzr, [x0, #8] +; GISel-MOPS-NEXT: ret ; ; SDAG-WITHOUT-MOPS-O2-LABEL: memset_10_zeroval_volatile: ; SDAG-WITHOUT-MOPS-O2: // %bb.0: // %entry @@ -490,43 +461,46 @@ entry: define void @memset_10_volatile(ptr %dst, i32 %value) { ; GISel-WITHOUT-MOPS-O0-LABEL: memset_10_volatile: ; GISel-WITHOUT-MOPS-O0: // %bb.0: // %entry -; GISel-WITHOUT-MOPS-O0-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; GISel-WITHOUT-MOPS-O0-NEXT: .cfi_def_cfa_offset 16 -; GISel-WITHOUT-MOPS-O0-NEXT: .cfi_offset w30, -16 -; GISel-WITHOUT-MOPS-O0-NEXT: mov w8, #10 // =0xa -; GISel-WITHOUT-MOPS-O0-NEXT: mov w2, w8 -; GISel-WITHOUT-MOPS-O0-NEXT: bl memset -; GISel-WITHOUT-MOPS-O0-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; GISel-WITHOUT-MOPS-O0-NEXT: // implicit-def: $x8 +; GISel-WITHOUT-MOPS-O0-NEXT: mov w8, w1 +; GISel-WITHOUT-MOPS-O0-NEXT: and x8, x8, #0xff +; GISel-WITHOUT-MOPS-O0-NEXT: mov x9, #72340172838076673 // =0x101010101010101 +; GISel-WITHOUT-MOPS-O0-NEXT: mul x8, x8, x9 +; GISel-WITHOUT-MOPS-O0-NEXT: str x8, [x0] +; GISel-WITHOUT-MOPS-O0-NEXT: // kill: def $w8 killed $w8 killed $x8 +; GISel-WITHOUT-MOPS-O0-NEXT: strh w8, [x0, #8] ; GISel-WITHOUT-MOPS-O0-NEXT: ret ; ; GISel-WITHOUT-MOPS-O3-LABEL: memset_10_volatile: ; GISel-WITHOUT-MOPS-O3: // %bb.0: // %entry -; GISel-WITHOUT-MOPS-O3-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; GISel-WITHOUT-MOPS-O3-NEXT: .cfi_def_cfa_offset 16 -; GISel-WITHOUT-MOPS-O3-NEXT: .cfi_offset w30, -16 -; GISel-WITHOUT-MOPS-O3-NEXT: mov w2, #10 // =0xa -; GISel-WITHOUT-MOPS-O3-NEXT: bl memset -; GISel-WITHOUT-MOPS-O3-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; GISel-WITHOUT-MOPS-O3-NEXT: // kill: def $w1 killed $w1 def $x1 +; GISel-WITHOUT-MOPS-O3-NEXT: mov x8, #72340172838076673 // =0x101010101010101 +; GISel-WITHOUT-MOPS-O3-NEXT: and x9, x1, #0xff +; GISel-WITHOUT-MOPS-O3-NEXT: mul x8, x9, x8 +; GISel-WITHOUT-MOPS-O3-NEXT: str x8, [x0] +; GISel-WITHOUT-MOPS-O3-NEXT: strh w8, [x0, #8] ; GISel-WITHOUT-MOPS-O3-NEXT: ret ; ; GISel-MOPS-O0-LABEL: memset_10_volatile: ; GISel-MOPS-O0: // %bb.0: // %entry -; GISel-MOPS-O0-NEXT: mov w8, #10 // =0xa -; GISel-MOPS-O0-NEXT: // kill: def $x8 killed $w8 -; GISel-MOPS-O0-NEXT: // implicit-def: $x9 -; GISel-MOPS-O0-NEXT: mov w9, w1 -; GISel-MOPS-O0-NEXT: setp [x0]!, x8!, x9 -; GISel-MOPS-O0-NEXT: setm [x0]!, x8!, x9 -; GISel-MOPS-O0-NEXT: sete [x0]!, x8!, x9 +; GISel-MOPS-O0-NEXT: // implicit-def: $x8 +; GISel-MOPS-O0-NEXT: mov w8, w1 +; GISel-MOPS-O0-NEXT: and x8, x8, #0xff +; GISel-MOPS-O0-NEXT: mov x9, #72340172838076673 // =0x101010101010101 +; GISel-MOPS-O0-NEXT: mul x8, x8, x9 +; GISel-MOPS-O0-NEXT: str x8, [x0] +; GISel-MOPS-O0-NEXT: // kill: def $w8 killed $w8 killed $x8 +; GISel-MOPS-O0-NEXT: strh w8, [x0, #8] ; GISel-MOPS-O0-NEXT: ret ; ; GISel-MOPS-O3-LABEL: memset_10_volatile: ; GISel-MOPS-O3: // %bb.0: // %entry -; GISel-MOPS-O3-NEXT: mov w8, #10 // =0xa ; GISel-MOPS-O3-NEXT: // kill: def $w1 killed $w1 def $x1 -; GISel-MOPS-O3-NEXT: setp [x0]!, x8!, x1 -; GISel-MOPS-O3-NEXT: setm [x0]!, x8!, x1 -; GISel-MOPS-O3-NEXT: sete [x0]!, x8!, x1 +; GISel-MOPS-O3-NEXT: mov x8, #72340172838076673 // =0x101010101010101 +; GISel-MOPS-O3-NEXT: and x9, x1, #0xff +; GISel-MOPS-O3-NEXT: mul x8, x9, x8 +; GISel-MOPS-O3-NEXT: str x8, [x0] +; GISel-MOPS-O3-NEXT: strh w8, [x0, #8] ; GISel-MOPS-O3-NEXT: ret ; ; SDAG-WITHOUT-MOPS-O2-LABEL: memset_10_volatile: @@ -905,43 +879,21 @@ entry: } define void @memcpy_10_volatile(ptr %dst, ptr %src, i32 %value) { -; GISel-WITHOUT-MOPS-O0-LABEL: memcpy_10_volatile: -; GISel-WITHOUT-MOPS-O0: // %bb.0: // %entry -; GISel-WITHOUT-MOPS-O0-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; GISel-WITHOUT-MOPS-O0-NEXT: .cfi_def_cfa_offset 16 -; GISel-WITHOUT-MOPS-O0-NEXT: .cfi_offset w30, -16 -; GISel-WITHOUT-MOPS-O0-NEXT: mov w8, #10 // =0xa -; GISel-WITHOUT-MOPS-O0-NEXT: mov w2, w8 -; GISel-WITHOUT-MOPS-O0-NEXT: bl memcpy -; GISel-WITHOUT-MOPS-O0-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; GISel-WITHOUT-MOPS-O0-NEXT: ret -; -; GISel-WITHOUT-MOPS-O3-LABEL: memcpy_10_volatile: -; GISel-WITHOUT-MOPS-O3: // %bb.0: // %entry -; GISel-WITHOUT-MOPS-O3-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; GISel-WITHOUT-MOPS-O3-NEXT: .cfi_def_cfa_offset 16 -; GISel-WITHOUT-MOPS-O3-NEXT: .cfi_offset w30, -16 -; GISel-WITHOUT-MOPS-O3-NEXT: mov w2, #10 // =0xa -; GISel-WITHOUT-MOPS-O3-NEXT: bl memcpy -; GISel-WITHOUT-MOPS-O3-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; GISel-WITHOUT-MOPS-O3-NEXT: ret -; -; GISel-MOPS-O0-LABEL: memcpy_10_volatile: -; GISel-MOPS-O0: // %bb.0: // %entry -; GISel-MOPS-O0-NEXT: mov w8, #10 // =0xa -; GISel-MOPS-O0-NEXT: // kill: def $x8 killed $w8 -; GISel-MOPS-O0-NEXT: cpyfp [x0]!, [x1]!, x8! -; GISel-MOPS-O0-NEXT: cpyfm [x0]!, [x1]!, x8! -; GISel-MOPS-O0-NEXT: cpyfe [x0]!, [x1]!, x8! -; GISel-MOPS-O0-NEXT: ret +; GISel-WITHOUT-MOPS-LABEL: memcpy_10_volatile: +; GISel-WITHOUT-MOPS: // %bb.0: // %entry +; GISel-WITHOUT-MOPS-NEXT: ldr x8, [x1] +; GISel-WITHOUT-MOPS-NEXT: str x8, [x0] +; GISel-WITHOUT-MOPS-NEXT: ldrh w8, [x1, #8] +; GISel-WITHOUT-MOPS-NEXT: strh w8, [x0, #8] +; GISel-WITHOUT-MOPS-NEXT: ret ; -; GISel-MOPS-O3-LABEL: memcpy_10_volatile: -; GISel-MOPS-O3: // %bb.0: // %entry -; GISel-MOPS-O3-NEXT: mov w8, #10 // =0xa -; GISel-MOPS-O3-NEXT: cpyfp [x0]!, [x1]!, x8! -; GISel-MOPS-O3-NEXT: cpyfm [x0]!, [x1]!, x8! -; GISel-MOPS-O3-NEXT: cpyfe [x0]!, [x1]!, x8! -; GISel-MOPS-O3-NEXT: ret +; GISel-MOPS-LABEL: memcpy_10_volatile: +; GISel-MOPS: // %bb.0: // %entry +; GISel-MOPS-NEXT: ldr x8, [x1] +; GISel-MOPS-NEXT: str x8, [x0] +; GISel-MOPS-NEXT: ldrh w8, [x1, #8] +; GISel-MOPS-NEXT: strh w8, [x0, #8] +; GISel-MOPS-NEXT: ret ; ; SDAG-WITHOUT-MOPS-O2-LABEL: memcpy_10_volatile: ; SDAG-WITHOUT-MOPS-O2: // %bb.0: // %entry @@ -1736,40 +1688,34 @@ entry: define void @memmove_10_volatile(ptr %dst, ptr %src, i32 %value) { ; GISel-WITHOUT-MOPS-O0-LABEL: memmove_10_volatile: ; GISel-WITHOUT-MOPS-O0: // %bb.0: // %entry -; GISel-WITHOUT-MOPS-O0-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; GISel-WITHOUT-MOPS-O0-NEXT: .cfi_def_cfa_offset 16 -; GISel-WITHOUT-MOPS-O0-NEXT: .cfi_offset w30, -16 -; GISel-WITHOUT-MOPS-O0-NEXT: mov w8, #10 // =0xa -; GISel-WITHOUT-MOPS-O0-NEXT: mov w2, w8 -; GISel-WITHOUT-MOPS-O0-NEXT: bl memmove -; GISel-WITHOUT-MOPS-O0-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; GISel-WITHOUT-MOPS-O0-NEXT: ldr x9, [x1] +; GISel-WITHOUT-MOPS-O0-NEXT: ldrh w8, [x1, #8] +; GISel-WITHOUT-MOPS-O0-NEXT: str x9, [x0] +; GISel-WITHOUT-MOPS-O0-NEXT: strh w8, [x0, #8] ; GISel-WITHOUT-MOPS-O0-NEXT: ret ; ; GISel-WITHOUT-MOPS-O3-LABEL: memmove_10_volatile: ; GISel-WITHOUT-MOPS-O3: // %bb.0: // %entry -; GISel-WITHOUT-MOPS-O3-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; GISel-WITHOUT-MOPS-O3-NEXT: .cfi_def_cfa_offset 16 -; GISel-WITHOUT-MOPS-O3-NEXT: .cfi_offset w30, -16 -; GISel-WITHOUT-MOPS-O3-NEXT: mov w2, #10 // =0xa -; GISel-WITHOUT-MOPS-O3-NEXT: bl memmove -; GISel-WITHOUT-MOPS-O3-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; GISel-WITHOUT-MOPS-O3-NEXT: ldr x8, [x1] +; GISel-WITHOUT-MOPS-O3-NEXT: ldrh w9, [x1, #8] +; GISel-WITHOUT-MOPS-O3-NEXT: str x8, [x0] +; GISel-WITHOUT-MOPS-O3-NEXT: strh w9, [x0, #8] ; GISel-WITHOUT-MOPS-O3-NEXT: ret ; ; GISel-MOPS-O0-LABEL: memmove_10_volatile: ; GISel-MOPS-O0: // %bb.0: // %entry -; GISel-MOPS-O0-NEXT: mov w8, #10 // =0xa -; GISel-MOPS-O0-NEXT: // kill: def $x8 killed $w8 -; GISel-MOPS-O0-NEXT: cpyp [x0]!, [x1]!, x8! -; GISel-MOPS-O0-NEXT: cpym [x0]!, [x1]!, x8! -; GISel-MOPS-O0-NEXT: cpye [x0]!, [x1]!, x8! +; GISel-MOPS-O0-NEXT: ldr x9, [x1] +; GISel-MOPS-O0-NEXT: ldrh w8, [x1, #8] +; GISel-MOPS-O0-NEXT: str x9, [x0] +; GISel-MOPS-O0-NEXT: strh w8, [x0, #8] ; GISel-MOPS-O0-NEXT: ret ; ; GISel-MOPS-O3-LABEL: memmove_10_volatile: ; GISel-MOPS-O3: // %bb.0: // %entry -; GISel-MOPS-O3-NEXT: mov w8, #10 // =0xa -; GISel-MOPS-O3-NEXT: cpyp [x0]!, [x1]!, x8! -; GISel-MOPS-O3-NEXT: cpym [x0]!, [x1]!, x8! -; GISel-MOPS-O3-NEXT: cpye [x0]!, [x1]!, x8! +; GISel-MOPS-O3-NEXT: ldr x8, [x1] +; GISel-MOPS-O3-NEXT: ldrh w9, [x1, #8] +; GISel-MOPS-O3-NEXT: str x8, [x0] +; GISel-MOPS-O3-NEXT: strh w9, [x0, #8] ; GISel-MOPS-O3-NEXT: ret ; ; SDAG-WITHOUT-MOPS-O2-LABEL: memmove_10_volatile: