From 4d2b85447340a50b46077ca0bddac1a3ab8b7139 Mon Sep 17 00:00:00 2001 From: Hamza Khallouki Date: Mon, 28 Apr 2025 13:14:28 +0100 Subject: [PATCH 1/2] [AIE2P] Add AIE2P run line in AIEClusterBaseAddress test --- llvm/test/CodeGen/AIE/GlobalISel/cluster-base-address-vec.mir | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/AIE/GlobalISel/cluster-base-address-vec.mir b/llvm/test/CodeGen/AIE/GlobalISel/cluster-base-address-vec.mir index affd07ffc1f5..abce309da43e 100644 --- a/llvm/test/CodeGen/AIE/GlobalISel/cluster-base-address-vec.mir +++ b/llvm/test/CodeGen/AIE/GlobalISel/cluster-base-address-vec.mir @@ -4,8 +4,9 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its affiliates +# (c) Copyright 2023-2025 Advanced Micro Devices, Inc. or its affiliates # RUN: llc -mtriple aie2 -run-pass=aie-cluster-base-address --aie-chain-addr-vec-ldst --aie-chain-addr-scl-ldst=false %s -verify-machineinstrs -o - | FileCheck %s +# RUN: llc -mtriple aie2p -run-pass=aie-cluster-base-address --aie-chain-addr-vec-ldst --aie-chain-addr-scl-ldst=false %s -verify-machineinstrs -o - | FileCheck %s # In this test, we chain the ptradds until we reach a store, then break the chain # to not create data dependencies later on. From 5230bcf563cc5c81f95762868644c09012a559a9 Mon Sep 17 00:00:00 2001 From: Hamza Khallouki Date: Mon, 28 Apr 2025 13:24:29 +0100 Subject: [PATCH 2/2] [AIE2P] Allow more address chaining and add a profitability check heuristic This allows chaining when the base pointer is used in other basic blocks but only when it is considered profitable: - This shouldn't happen in a loop as the resulting copy will be more costly - The cost of chaining is incremented for each offset falling outside the load/store immediate ranges. 
- An experimental threshold is used to determine if chaining is profitable based on the compute cost (e.g. half the number of pointer adds to be chained) --- llvm/lib/Target/AIE/AIE2InstrInfo.cpp | 12 ++ llvm/lib/Target/AIE/AIE2InstrInfo.h | 3 + llvm/lib/Target/AIE/AIEBaseInstrInfo.h | 10 + llvm/lib/Target/AIE/AIEClusterBaseAddress.cpp | 133 +++++++++++- llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.cpp | 12 ++ llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.h | 3 + .../cluster-base-address-vec-heuristic.mir | 190 ++++++++++++++++++ 7 files changed, 358 insertions(+), 5 deletions(-) create mode 100644 llvm/test/CodeGen/AIE/GlobalISel/cluster-base-address-vec-heuristic.mir diff --git a/llvm/lib/Target/AIE/AIE2InstrInfo.cpp b/llvm/lib/Target/AIE/AIE2InstrInfo.cpp index 2731ceeaf8ea..bceee56b2900 100644 --- a/llvm/lib/Target/AIE/AIE2InstrInfo.cpp +++ b/llvm/lib/Target/AIE/AIE2InstrInfo.cpp @@ -1650,3 +1650,15 @@ unsigned AIE2InstrInfo::getBasicVectorBitSize() const { return 512; } unsigned AIE2InstrInfo::getMaxVectorBitSize() const { return 1024; } unsigned AIE2InstrInfo::getMaxSupportedLdStIncSize() const { return 512; } + +AIEBaseInstrInfo::ImmediateRangeBounds +AIE2InstrInfo::getLoadStorePostIncImmediateRange(LLT MemType) const { + if (MemType.getSizeInBits() == 8) + return {7, -8}; + else if (MemType.getSizeInBits() == 16) + return {7, -8}; + else if (MemType.getSizeInBits() <= 32) + return {252, -256}; + else + llvm_unreachable("Unsupported"); +} diff --git a/llvm/lib/Target/AIE/AIE2InstrInfo.h b/llvm/lib/Target/AIE/AIE2InstrInfo.h index 436509ce5e9a..203f7df1008a 100644 --- a/llvm/lib/Target/AIE/AIE2InstrInfo.h +++ b/llvm/lib/Target/AIE/AIE2InstrInfo.h @@ -75,6 +75,9 @@ class AIE2InstrInfo : public AIE2GenInstrInfo { unsigned getMaxVectorBitSize() const override; unsigned getMaxSupportedLdStIncSize() const override; + ImmediateRangeBounds + getLoadStorePostIncImmediateRange(LLT MemType) const override; + virtual unsigned getNumReservedDelaySlots(const MachineInstr &MI) const 
override; diff --git a/llvm/lib/Target/AIE/AIEBaseInstrInfo.h b/llvm/lib/Target/AIE/AIEBaseInstrInfo.h index acb4a9efacc0..9e38bd954aeb 100644 --- a/llvm/lib/Target/AIE/AIEBaseInstrInfo.h +++ b/llvm/lib/Target/AIE/AIEBaseInstrInfo.h @@ -638,6 +638,16 @@ struct AIEBaseInstrInfo : public TargetInstrInfo { llvm_unreachable("Target didn't implement getMaxSupportedLdStIncSize!"); } + struct ImmediateRangeBounds { + int64_t ImmediateRangeMax; + int64_t ImmediateRangeMin; + }; + virtual ImmediateRangeBounds + getLoadStorePostIncImmediateRange(LLT MemType) const { + llvm_unreachable( + "Target didn't implement getLoadStorePostIncImmediateRange!"); + } + /// Abstract operations to help the decoding of complex operations. struct AbstractOp { enum class OperationType : unsigned { diff --git a/llvm/lib/Target/AIE/AIEClusterBaseAddress.cpp b/llvm/lib/Target/AIE/AIEClusterBaseAddress.cpp index 029c4c1ce405..abd4a3fe5c71 100644 --- a/llvm/lib/Target/AIE/AIEClusterBaseAddress.cpp +++ b/llvm/lib/Target/AIE/AIEClusterBaseAddress.cpp @@ -4,7 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its affiliates +// (c) Copyright 2023-2025 Advanced Micro Devices, Inc. 
or its affiliates // //===----------------------------------------------------------------------===// // // @@ -45,12 +45,15 @@ //===----------------------------------------------------------------------===// #include "AIE.h" +#include "AIEBaseInstrInfo.h" +#include "Utils/AIELoopUtils.h" #include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/InitializePasses.h" @@ -72,6 +75,11 @@ static cl::opt<bool> EnableChainsForVectorLdSt( "aie-chain-addr-vec-ldst", cl::Hidden, cl::init(true), cl::desc("Enable ptradd chaining for vector loads and stores.")); +cl::opt<int> AddressChainCostLimit( + "aie-chain-cost-limit", + cl::desc("Maximum allowed cost for pointer add chains"), cl::init(-1), + cl::Hidden); + namespace { /// Try and re-order PTR_ADD instructions to maximise the size of constant @@ -163,6 +171,8 @@ class AIEClusterBaseAddress : public MachineFunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); + AU.addRequired<MachineLoopInfo>(); + AU.addPreserved<MachineLoopInfo>(); AU.addRequired(); AU.setPreservesAll(); } @@ -223,10 +233,123 @@ if (Instrs.size() <= 1) return true; - // If the base reg is used in any of the successive MBBs, then we don't - // want to chain the corresponding ptr adds, since this would introduce a - // COPY and increase reg pressure. - return isRegUsedInSuccessiveMBBs(&MBB, PtrReg); + // If the base reg is used in any of the successive MBBs, chaining would introduce a + // COPY and increase reg pressure. We only skip chaining in this case if it + // is considered unprofitable. 
+ if (isRegUsedInSuccessiveMBBs(&MBB, PtrReg) && + !isChainingProfitable(PtrReg, Instrs, MBB)) + return true; + + return false; + } + + // Decide heuristically if chaining will be profitable + bool isChainingProfitable(Register PtrReg, + const SmallVector<MachineInstr *> &Instrs, + MachineBasicBlock &MBB) { + const TargetSubtargetInfo &ST = MBB.getParent()->getSubtarget(); + const AIEBaseInstrInfo *TII = + static_cast<const AIEBaseInstrInfo *>(ST.getInstrInfo()); + using OffsetType = std::variant<int64_t, std::string>; + assert(Instrs.size() > 1); + + bool InLoop = true; + MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>(); + MachineLoop *ToLoop = MLI.getLoopFor(&MBB); + if (!ToLoop) + InLoop = false; + + unsigned ChainedCost = 0; + unsigned ChainedCostLimit = Instrs.size() / 2; // Experimental threshold + + if (AddressChainCostLimit > -1) { + ChainedCostLimit = AddressChainCostLimit; + } + + if (isRegUsedInSuccessiveMBBs(&MBB, PtrReg)) { + if (InLoop) + return false; // A copy in a loop is costly + ChainedCost += 1; // Add cost of resulting copy + } + + int64_t ImmediateRangeMax = 0; + int64_t ImmediateRangeMin = 0; + bool ImmediateRangeSet = false; + int64_t AccumulatedOffset = 0; + int64_t NewOffset; + SmallVector<OffsetType> Offsets; + + for (unsigned I = 0; I < Instrs.size() - 1; I++) { + MachineInstr *MI = Instrs[I]; + MachineInstr *MINext = Instrs[I + 1]; + + const Register PtrReg = MI->getOperand(0).getReg(); + for (const MachineInstr &UseMI : MRI->use_instructions(PtrReg)) { + if (ImmediateRangeSet) + continue; // Check first use only + if (!UseMI.mayLoadOrStore()) + continue; + const LLT MemType = getLoadStoreType(UseMI); + // Immediate ranges for vectors are sufficient so we + // assume chaining is always profitable. 
+ if (MemType.isVector()) { + return true; + } else { + if (MemType.getSizeInBits() <= 32) { + ImmediateRangeMax = TII->getLoadStorePostIncImmediateRange(MemType) + .ImmediateRangeMax; + ImmediateRangeMin = TII->getLoadStorePostIncImmediateRange(MemType) + .ImmediateRangeMin; + ImmediateRangeSet = true; + } else { + llvm_unreachable( + "unreachable: Unsupported immediate range of scalar size "); + } + } + } + + // If the immediate range is not set, the pointers aren't used by any + // loads and stores, so we return. + if (!ImmediateRangeSet) { + assert(ImmediateRangeMin == 0 && ImmediateRangeMax == 0); + return false; + } + + auto OffsetMI = + getIConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), *MRI); + auto OffsetMINext = getIConstantVRegValWithLookThrough( + MINext->getOperand(2).getReg(), *MRI); + + if (shouldBreakChain(MI, MINext, OffsetMI, OffsetMINext)) { + ChainedCost++; + AccumulatedOffset = 0; + Offsets.push_back("Break"); + continue; + } + + const int64_t CurrOffset = OffsetMI->Value.getSExtValue(); + const int64_t NextOffset = OffsetMINext->Value.getSExtValue(); + + assert(I == 0 || !Offsets.empty()); + AccumulatedOffset += + (I == 0 || (std::holds_alternative<std::string>(Offsets.back()) && + std::get<std::string>(Offsets.back()) == "Break")) + ? CurrOffset + : NewOffset; + Offsets.push_back( + (I == 0 || (std::holds_alternative<std::string>(Offsets.back()) && + std::get<std::string>(Offsets.back()) == "Break")) + ? CurrOffset + : OffsetType(NewOffset)); + + NewOffset = NextOffset - AccumulatedOffset; + + if (NewOffset < ImmediateRangeMin || NewOffset > ImmediateRangeMax) { + ChainedCost += 1; // Immediate materialization cost + } + } + + return ChainedCostLimit > ChainedCost; + } + + // Build a chain (or set of chains) of G_PTR_ADDs. 
We consider as diff --git a/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.cpp b/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.cpp index 49d892737bf3..7870a5a0a839 100644 --- a/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.cpp +++ b/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.cpp @@ -1812,6 +1812,18 @@ unsigned AIE2PInstrInfo::getMaxVectorBitSize() const { return 2048; } unsigned AIE2PInstrInfo::getMaxSupportedLdStIncSize() const { return 2048; } +AIEBaseInstrInfo::ImmediateRangeBounds +AIE2PInstrInfo::getLoadStorePostIncImmediateRange(LLT MemType) const { + if (MemType.getSizeInBits() == 8) + return {7, -8}; + else if (MemType.getSizeInBits() == 16) + return {14, -16}; + else if (MemType.getSizeInBits() <= 32) + return {28, -32}; + else + llvm_unreachable("Unsupported"); +} + using AbstractOp = AIEBaseInstrInfo::AbstractOp; std::optional diff --git a/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.h b/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.h index e763362be4d7..0991ab08c024 100644 --- a/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.h +++ b/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.h @@ -77,6 +77,9 @@ class AIE2PInstrInfo : public AIE2PGenInstrInfo { unsigned getMaxVectorBitSize() const override; unsigned getMaxSupportedLdStIncSize() const override; + ImmediateRangeBounds + getLoadStorePostIncImmediateRange(LLT MemType) const override; + virtual unsigned getNumReservedDelaySlots(const MachineInstr &MI) const override; diff --git a/llvm/test/CodeGen/AIE/GlobalISel/cluster-base-address-vec-heuristic.mir b/llvm/test/CodeGen/AIE/GlobalISel/cluster-base-address-vec-heuristic.mir new file mode 100644 index 000000000000..54f9fd5f82c6 --- /dev/null +++ b/llvm/test/CodeGen/AIE/GlobalISel/cluster-base-address-vec-heuristic.mir @@ -0,0 +1,190 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates +# RUN: llc -mtriple aie2p -start-before=aie-cluster-base-address -stop-after=postmisched --issue-limit=6 --aie-chain-addr-scl-ldst=true %s -verify-machineinstrs -o - | FileCheck %s + + + +# The store offsets exceed the immediate range of half scalar load/store requiring moves to materialize them +# Occupying the A slot with the loads forces the immediate moves to be hidden in the MOV slot, making chaining less profitable +--- +name: prevent_unprofitable_chaining +legalized: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: prevent_unprofitable_chaining + ; CHECK: bb.0 (align 16): + ; CHECK-NEXT: liveins: $p0, $p1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: ST_s16_idx_imm killed renamable $r0, renamable $p0, 0, implicit-def $pe2_ads, implicit $pe2_ads :: (store (s16) into stack - 96) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: renamable $r0 = MOVA 0 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: BUNDLE implicit-def $r1, implicit-def $dj0, implicit $p1 { + ; CHECK-NEXT: renamable $r1 = LDA_dms_lda_idx_imm renamable $p1, 0 :: (load (s32)) + ; CHECK-NEXT: renamable $dj0 = MOV_alu_mv_mv_mv_cg 16 + ; CHECK-NEXT: } + ; CHECK-NEXT: ST_s16_idx killed renamable $r1, renamable $p0, killed renamable $dj0, implicit-def $pe2_ads, implicit $pe2_ads :: (store (s16) into stack - 80) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: BUNDLE implicit-def $r1, implicit-def $dj0, implicit $p1 { + ; CHECK-NEXT: renamable $r1 = LDA_dms_lda_idx_imm renamable $p1, 0 :: (load (s32)) + ; CHECK-NEXT: renamable $dj0 = MOV_alu_mv_mv_mv_cg 32 + ; CHECK-NEXT: } + ; CHECK-NEXT: ST_s16_idx killed renamable $r1, renamable $p0, killed renamable $dj0, implicit-def $pe2_ads, implicit $pe2_ads :: (store (s16) into stack - 64, align 32) + ; 
CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: BUNDLE implicit-def $r1, implicit-def $dj0, implicit $p1 { + ; CHECK-NEXT: renamable $r1 = LDA_dms_lda_idx_imm renamable $p1, 0 :: (load (s32)) + ; CHECK-NEXT: renamable $dj0 = MOV_alu_mv_mv_mv_cg 64 + ; CHECK-NEXT: } + ; CHECK-NEXT: ST_s16_idx killed renamable $r1, renamable $p0, killed renamable $dj0, implicit-def $pe2_ads, implicit $pe2_ads :: (store (s16) into stack - 48) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: BUNDLE implicit-def $r1, implicit-def $dj0, implicit killed $p1 { + ; CHECK-NEXT: renamable $r1 = LDA_dms_lda_idx_imm killed renamable $p1, 0 :: (load (s32)) + ; CHECK-NEXT: renamable $dj0 = MOV_alu_mv_mv_mv_cg 96 + ; CHECK-NEXT: } + ; CHECK-NEXT: ST_s16_idx killed renamable $r1, renamable $p0, killed renamable $dj0, implicit-def $pe2_ads, implicit $pe2_ads :: (store (s16) into stack - 32) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: ST_s16_idx_imm killed renamable $r0, killed renamable $p0, 0, implicit-def $pe2_ads, implicit $pe2_ads :: (store (s16)) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + bb.0: + liveins: $p0, $p1 + %0:_(p0) = COPY $p0 + %101:_(p0) = COPY $p1 + %8:_(s32) = G_CONSTANT i32 0 + G_STORE %8:_(s32), %0:_(p0) :: (store (s16) into stack - 96) + %1:_(s20) = G_CONSTANT i20 16 + %2:_(p0) = G_PTR_ADD %0:_, %1:_(s20) + %81:_(s32) = G_LOAD %101:_(p0) :: (load (s32) from unknown-address) + G_STORE %81:_(s32), %2:_(p0) :: (store (s16) into stack - 80) + %4:_(s20) = G_CONSTANT i20 32 + %5:_(p0) = G_PTR_ADD %0:_, %4:_(s20) + %82:_(s32) = G_LOAD %101:_(p0) :: (load (s32) from unknown-address) + G_STORE %82:_(s32), %5:_(p0) :: (store (s16) 
into stack - 64, align 32) + %6:_(s20) = G_CONSTANT i20 64 + %7:_(p0) = G_PTR_ADD %0:_, %6:_(s20) + %83:_(s32) = G_LOAD %101:_(p0) :: (load (s32) from unknown-address) + G_STORE %83:_(s32), %7:_(p0) :: (store (s16) into stack - 48) + %9:_(s20) = G_CONSTANT i20 96 + %10:_(p0) = G_PTR_ADD %0:_, %9:_(s20) + %84:_(s32) = G_LOAD %101:_(p0) :: (load (s32) from unknown-address) + G_STORE %84:_(s32), %10:_(p0) :: (store (s16) into stack - 32) + bb.1: + G_STORE %8:_(s32), %0:_(p0) :: (store (s16) into unknown-address) +... + +# The store offsets fit the immediate range of half scalar load/store +--- +name: allow_profitable_chaining +legalized: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: allow_profitable_chaining + ; CHECK: bb.0 (align 16): + ; CHECK-NEXT: liveins: $p0, $p1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: ST_s16_idx_imm killed renamable $r0, renamable $p0, 0, implicit-def $pe2_ads, implicit $pe2_ads :: (store (s16) into stack - 96) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: renamable $r0 = MOVA 0 + ; CHECK-NEXT: NOP + ; CHECK-NEXT: renamable $r1 = LDA_dms_lda_idx_imm renamable $p1, 0 :: (load (s32)) + ; CHECK-NEXT: ST_s16_idx_imm killed renamable $r1, renamable $p0, 4, implicit-def $pe2_ads, implicit $pe2_ads :: (store (s16) into stack - 80) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: renamable $r1 = LDA_dms_lda_idx_imm renamable $p1, 0 :: (load (s32)) + ; CHECK-NEXT: ST_s16_idx_imm killed renamable $r1, renamable $p0, 4, implicit-def $pe2_ads, implicit $pe2_ads :: (store (s16) into stack - 64, align 32) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: renamable $r1 = LDA_dms_lda_idx_imm renamable $p1, 0 :: (load (s32)) + ; CHECK-NEXT: ST_s16_idx_imm killed renamable $r1, renamable $p0, 8, implicit-def $pe2_ads, implicit 
$pe2_ads :: (store (s16) into stack - 48) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: renamable $r1 = LDA_dms_lda_idx_imm killed renamable $p1, 0 :: (load (s32)) + ; CHECK-NEXT: ST_s16_idx_imm killed renamable $r1, renamable $p0, 12, implicit-def $pe2_ads, implicit $pe2_ads :: (store (s16) into stack - 32) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: ST_s16_idx_imm killed renamable $r0, killed renamable $p0, 0, implicit-def $pe2_ads, implicit $pe2_ads :: (store (s16)) + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP + bb.0: + liveins: $p0, $p1 + %0:_(p0) = COPY $p0 + %101:_(p0) = COPY $p1 + %8:_(s32) = G_CONSTANT i32 0 + G_STORE %8:_(s32), %0:_(p0) :: (store (s16) into stack - 96) + %1:_(s20) = G_CONSTANT i20 4 + %2:_(p0) = G_PTR_ADD %0:_, %1:_(s20) + %81:_(s32) = G_LOAD %101:_(p0) :: (load (s32) from unknown-address) + G_STORE %81:_(s32), %2:_(p0) :: (store (s16) into stack - 80) + %4:_(s20) = G_CONSTANT i20 4 + %5:_(p0) = G_PTR_ADD %0:_, %4:_(s20) + %82:_(s32) = G_LOAD %101:_(p0) :: (load (s32) from unknown-address) + G_STORE %82:_(s32), %5:_(p0) :: (store (s16) into stack - 64, align 32) + %6:_(s20) = G_CONSTANT i20 8 + %7:_(p0) = G_PTR_ADD %0:_, %6:_(s20) + %83:_(s32) = G_LOAD %101:_(p0) :: (load (s32) from unknown-address) + G_STORE %83:_(s32), %7:_(p0) :: (store (s16) into stack - 48) + %9:_(s20) = G_CONSTANT i20 12 + %10:_(p0) = G_PTR_ADD %0:_, %9:_(s20) + %84:_(s32) = G_LOAD %101:_(p0) :: (load (s32) from unknown-address) + G_STORE %84:_(s32), %10:_(p0) :: (store (s16) into stack - 32) + bb.1: + G_STORE %8:_(s32), %0:_(p0) :: (store (s16) into unknown-address) +... +