From afdc992206d5f70432239a2ff8e382bbcf262135 Mon Sep 17 00:00:00 2001 From: Andreu Carminati Date: Thu, 9 Oct 2025 07:40:51 -0600 Subject: [PATCH 1/9] [AIE2P] Extend allocation filter to include only Target MIs Now we filter by register class and usage. Basically, we exclude here instructions like copies and non-2D/3D ones. Co-Authored-By: Krishnam Tibrewala --- llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp | 5 +++ .../Target/AIE/aie2p/AIE2PTargetMachine.cpp | 36 ++++++++++++++++--- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp b/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp index 1e2d1f19110c..ae472b53094e 100644 --- a/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp +++ b/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp @@ -96,6 +96,11 @@ cl::opt EnableStagedRA("aie-staged-ra", cl::Hidden, cl::init(true), cl::desc("Enable multi-stage register allocation")); +cl::opt EnableFineGrainedStagedRA( + "aie-staged-ra-fine-grained-alloc", cl::Hidden, cl::init(true), + cl::desc("Enable multi-stage register allocation with fine-grained " + "selection of live intervals")); + cl::opt EnableWAWRegRewrite("aie-wawreg-rewrite", cl::desc("Enable the WAW Register Renaming in loops"), diff --git a/llvm/lib/Target/AIE/aie2p/AIE2PTargetMachine.cpp b/llvm/lib/Target/AIE/aie2p/AIE2PTargetMachine.cpp index 7a041a7e20db..357feb20b699 100644 --- a/llvm/lib/Target/AIE/aie2p/AIE2PTargetMachine.cpp +++ b/llvm/lib/Target/AIE/aie2p/AIE2PTargetMachine.cpp @@ -4,7 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +// (c) Copyright 2024-2025 Advanced Micro Devices, Inc. 
or its affiliates // //===----------------------------------------------------------------------===// // @@ -15,6 +15,7 @@ #include "AIE2PTargetMachine.h" #include "AIE2PTargetTransformInfo.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" using namespace llvm; extern cl::opt EnableStagedRA; @@ -25,6 +26,7 @@ extern cl::opt EnableAddressChaining; extern cl::opt EnableGlobalPtrModOptimizer; extern cl::opt EnableWAWRegRewrite; extern cl::opt EnableAIEIfConversion; +extern cl::opt EnableFineGrainedStagedRA; void AIE2PTargetMachine::anchor() {} @@ -60,17 +62,43 @@ void AIE2PPassConfig::addPreRegBankSelect() { } } +static bool isRegUsedBy2DOr3DInstruction(const MachineRegisterInfo &MRI, + const Register &R) { + + return llvm::any_of( + MRI.use_nodbg_instructions(R), [&](const MachineInstr &MI) { + auto &TII = *static_cast( + MI.getMF()->getSubtarget().getInstrInfo()); + + // We should recognize both cases, with and without splitting. A 2D/3D + // instruction will always be split os splittable. + return TII.getOpcodeWithTupleOperands(MI.getOpcode()).has_value() || + TII.getOpcodeWithAtomicOperands(MI.getOpcode()).has_value(); + }); +} + static bool onlyAllocate3DRegisters(const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI, const Register &R) { - return AIE2P::eDSRegClass.hasSubClassEq(MRI.getRegClass(R)); + + const TargetRegisterClass *RegClass = MRI.getRegClass(R); + if (!AIE2P::eDSRegClass.hasSubClassEq(RegClass)) + return false; + return EnableFineGrainedStagedRA ? 
isRegUsedBy2DOr3DInstruction(MRI, R) + : true; } + static bool onlyAllocate3D2DRegisters(const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI, const Register &R) { - return AIE2P::eDSRegClass.hasSubClassEq(MRI.getRegClass(R)) || - AIE2P::eDRegClass.hasSubClassEq(MRI.getRegClass(R)); + const TargetRegisterClass *RegClass = MRI.getRegClass(R); + if (!AIE2P::eDSRegClass.hasSubClassEq(RegClass) && + !AIE2P::eDRegClass.hasSubClassEq(RegClass)) + return false; + return EnableFineGrainedStagedRA ? isRegUsedBy2DOr3DInstruction(MRI, R) + : true; } + static bool onlyAllocateMRegisters(const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI, const Register &R) { From 12da76d5162dc3a58935479b9f794c96f3c492e6 Mon Sep 17 00:00:00 2001 From: Andreu Carminati Date: Fri, 10 Oct 2025 02:28:57 -0600 Subject: [PATCH 2/9] [AIEX] Fix LIs in SuperRegRewriter that are related to unallocated registers Co-Authored-By: Krishnam Tibrewala --- llvm/lib/Target/AIE/AIESuperRegRewriter.cpp | 32 +++++++++++++++++-- .../CodeGen/AIE/aie2p/ra/staged-ra-spill.mir | 16 +++++----- llvm/test/CodeGen/AIE/staged-ra-rewrite.mir | 27 +++++++++------- 3 files changed, 53 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Target/AIE/AIESuperRegRewriter.cpp b/llvm/lib/Target/AIE/AIESuperRegRewriter.cpp index 433aef018cbe..9c31ef7c91f7 100644 --- a/llvm/lib/Target/AIE/AIESuperRegRewriter.cpp +++ b/llvm/lib/Target/AIE/AIESuperRegRewriter.cpp @@ -4,7 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its affiliates +// (c) Copyright 2023-2025 Advanced Micro Devices, Inc. or its affiliates // //===----------------------------------------------------------------------===// @@ -153,6 +153,7 @@ bool AIESuperRegRewriter::runOnMachineFunction(MachineFunction &MF) { // Collect already-assigned VRegs that can be split into smaller ones. 
LLVM_DEBUG(VRM.dump()); + LLVM_DEBUG(LIS.dump()); for (unsigned VRegIdx = 0, End = MRI.getNumVirtRegs(); VRegIdx != End; ++VRegIdx) { Register Reg = Register::index2VirtReg(VRegIdx); @@ -208,7 +209,8 @@ static LaneBitmask getLiveLanesAt(SlotIndex Index, Register Reg, /// Rewrite a full copy into multiple copies using the subregs in \p CopySubRegs static void rewriteFullCopy(MachineInstr &MI, const std::set &CopySubRegs, LiveIntervals &LIS, const TargetInstrInfo &TII, - const TargetRegisterInfo &TRI) { + const TargetRegisterInfo &TRI, VirtRegMap &VRM, + LiveRegMatrix &LRM) { assert(MI.isFullCopy()); SlotIndex CopyIndex = LIS.getInstructionIndex(MI); LLVM_DEBUG(dbgs() << " Changing full copy at " << CopyIndex << ": " << MI); @@ -217,6 +219,8 @@ static void rewriteFullCopy(MachineInstr &MI, const std::set &CopySubRegs, LaneBitmask LiveSrcLanes = getLiveLanesAt(CopyIndex, SrcReg, LIS); LIS.removeVRegDefAt(LIS.getInterval(DstReg), CopyIndex.getRegSlot()); + + SmallSet RegistersToRepair; for (int SubRegIdx : CopySubRegs) { if ((LiveSrcLanes & TRI.getSubRegIndexLaneMask(SubRegIdx)).none()) { LLVM_DEBUG(dbgs() << " Skip undef subreg " @@ -232,10 +236,32 @@ static void rewriteFullCopy(MachineInstr &MI, const std::set &CopySubRegs, LLVM_DEBUG(dbgs() << " to " << *PartCopy); LIS.InsertMachineInstrInMaps(*PartCopy); LIS.getInterval(PartCopy->getOperand(0).getReg()); + RegistersToRepair.insert(PartCopy->getOperand(1).getReg()); } LIS.RemoveMachineInstrFromMaps(MI); MI.eraseFromParent(); + // As we don't handle all registers now (selective LI filter), + // We should make sure that all LiveIntervals are correct. + // If we dont't repair, MI will compose the LIs of some registers, + // what is not correct because MI was deleted. 
+ for (Register R : RegistersToRepair) { + + if (!LIS.hasInterval(R)) + continue; + + if (VRM.hasPhys(R)) { + const MCRegister PhysReg = VRM.getPhys(R); + const LiveInterval &OldLI = LIS.getInterval(R); + LRM.unassign(OldLI); + LIS.removeInterval(R); + const LiveInterval &LI = LIS.createAndComputeVirtRegInterval(R); + LRM.assign(LI, PhysReg); + } else { + LIS.removeInterval(R); + LIS.createAndComputeVirtRegInterval(R); + } + } } void AIESuperRegRewriter::rewriteSuperReg( @@ -260,7 +286,7 @@ void AIESuperRegRewriter::rewriteSuperReg( for (MachineInstr &MI : make_early_inc_range(MRI.reg_instructions(Reg))) { if (MI.isFullCopy()) rewriteFullCopy(MI, TRI.getSubRegSplit(MRI.getRegClass(Reg)->getID()), - LIS, *TII, TRI); + LIS, *TII, TRI, VRM, LRM); } LLVM_DEBUG(dbgs() << " Splitting range " << LIS.getInterval(Reg) << "\n"); diff --git a/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-spill.mir b/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-spill.mir index 3c040f002206..b40dbeee2ddd 100644 --- a/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-spill.mir +++ b/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-spill.mir @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its affiliates +# (c) Copyright 2023-2025 Advanced Micro Devices, Inc. 
or its affiliates # RUN: llc -O2 -mtriple=aie2p -verify-machineinstrs --aie-staged-ra -start-before=greedy -stop-after=virtregrewriter %s -o - \ # RUN: | FileCheck %s --check-prefix=RA @@ -24,23 +24,23 @@ body: | ; RA-NEXT: renamable $dn0 = LDA_dms_lda_idx_imm renamable $p1, 0 ; RA-NEXT: renamable $m0 = LDA_dms_lda_idx_imm renamable $p1, 4 ; RA-NEXT: renamable $dj0 = LDA_dms_lda_idx_imm renamable $p1, 8 - ; RA-NEXT: ST_D_SPILL renamable $d0, %stack.0, implicit $sp :: (store (s128) into %stack.0, align 4) + ; RA-NEXT: ST_D_SPILL renamable $d0, %stack.1, implicit $sp :: (store (s128) into %stack.1, align 4) ; RA-NEXT: renamable $dj0 = MOV_PD_imm11_pseudo 12 ; RA-NEXT: renamable $r0 = LDA_dms_lda_idx renamable $p1, killed renamable $dj0 - ; RA-NEXT: renamable $d0 = LDA_D_SPILL %stack.0, implicit $sp :: (load (s128) from %stack.0, align 4) + ; RA-NEXT: renamable $d0 = LDA_D_SPILL %stack.1, implicit $sp :: (load (s128) from %stack.1, align 4) ; RA-NEXT: renamable $dc0 = COPY killed renamable $r0 - ; RA-NEXT: ST_D_SPILL killed renamable $d0, %stack.0, implicit $sp :: (store (s128) into %stack.0, align 4) + ; RA-NEXT: ST_D_SPILL killed renamable $d0, %stack.1, implicit $sp :: (store (s128) into %stack.1, align 4) ; RA-NEXT: renamable $dn0 = LDA_dms_lda_idx_imm renamable $p1, 16 ; RA-NEXT: renamable $m0 = LDA_dms_lda_idx_imm renamable $p1, 20 ; RA-NEXT: renamable $dj0 = LDA_dms_lda_idx_imm renamable $p1, 24 ; RA-NEXT: renamable $dc0 = LDA_dms_lda_idx_imm killed renamable $p1, 28 - ; RA-NEXT: ST_D_SPILL killed renamable $d0, %stack.1, implicit $sp :: (store (s128) into %stack.1, align 4) - ; RA-NEXT: renamable $d0 = LDA_D_SPILL %stack.0, implicit $sp :: (load (s128) from %stack.0, align 4) - ; RA-NEXT: $p0, $dc0 = PADDA_2D_split killed $p0, $m0, $dn0, $dj0, $dc0 ; RA-NEXT: ST_D_SPILL killed renamable $d0, %stack.0, implicit $sp :: (store (s128) into %stack.0, align 4) ; RA-NEXT: renamable $d0 = LDA_D_SPILL %stack.1, implicit $sp :: (load (s128) from %stack.1, align 4) - ; 
RA-NEXT: $p0, dead $dc0 = PADDA_2D_split killed $p0, $m0, $dn0, $dj0, $dc0 + ; RA-NEXT: $p0, $dc0 = PADDA_2D_split killed $p0, killed $m0, killed $dn0, killed $dj0, killed $dc0 + ; RA-NEXT: ST_D_SPILL renamable $d0, %stack.1, implicit $sp :: (store (s128) into %stack.1, align 4) ; RA-NEXT: renamable $d0 = LDA_D_SPILL %stack.0, implicit $sp :: (load (s128) from %stack.0, align 4) + ; RA-NEXT: $p0, dead $dc0 = PADDA_2D_split killed $p0, killed $m0, killed $dn0, killed $dj0, killed $dc0 + ; RA-NEXT: renamable $d0 = LDA_D_SPILL %stack.1, implicit $sp :: (load (s128) from %stack.1, align 4) ; RA-NEXT: PseudoRET implicit $lr, implicit killed renamable $p0, implicit killed renamable $dc0, implicit $d1, implicit $d2, implicit $d3, implicit $d4, implicit $d5, implicit $d6, implicit $d7 %20:ep = COPY $p0 %21:ep = COPY $p1 diff --git a/llvm/test/CodeGen/AIE/staged-ra-rewrite.mir b/llvm/test/CodeGen/AIE/staged-ra-rewrite.mir index a5bf4107fc8a..9f8898f794f9 100644 --- a/llvm/test/CodeGen/AIE/staged-ra-rewrite.mir +++ b/llvm/test/CodeGen/AIE/staged-ra-rewrite.mir @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its affiliates +# (c) Copyright 2023-2025 Advanced Micro Devices, Inc. 
or its affiliates # RUN: llc -O2 -mtriple=aie2 -verify-machineinstrs -run-pass=greedy,aie-superreg-rewrite %s -o - | FileCheck %s --check-prefix=AIE2-VREGS # RUN: llc -O2 -mtriple=aie2 -verify-machineinstrs --aie-staged-ra -start-before=greedy -stop-after=virtregrewriter %s -o - \ @@ -291,15 +291,16 @@ body: | ; AIE2P-RA-NEXT: renamable $r0 = LDA_dms_lda_idx_imm renamable $p1, 4 ; AIE2P-RA-NEXT: renamable $r1 = LDA_dms_lda_idx_imm renamable $p1, 8 ; AIE2P-RA-NEXT: renamable $r2 = LDA_dms_lda_idx_imm killed renamable $p1, 12 - ; AIE2P-RA-NEXT: renamable $dn1 = COPY killed renamable $r0 - ; AIE2P-RA-NEXT: renamable $dj1 = COPY killed renamable $r1 - ; AIE2P-RA-NEXT: renamable $dc1 = COPY killed renamable $r2 + ; AIE2P-RA-NEXT: renamable $dn0 = COPY killed renamable $r0 + ; AIE2P-RA-NEXT: renamable $m0 = COPY killed renamable $m1 + ; AIE2P-RA-NEXT: renamable $dj0 = COPY killed renamable $r1 + ; AIE2P-RA-NEXT: renamable $dc0 = COPY killed renamable $r2 ; AIE2P-RA-NEXT: {{ $}} ; AIE2P-RA-NEXT: bb.1: - ; AIE2P-RA-NEXT: liveins: $dc1, $dj1, $dn1, $m1, $p0 + ; AIE2P-RA-NEXT: liveins: $dc0, $dj0, $dn0, $m0, $p0 ; AIE2P-RA-NEXT: {{ $}} - ; AIE2P-RA-NEXT: $p0, $dc1 = PADDA_2D_split killed $p0, killed $m1, killed $dn1, killed $dj1, killed $dc1 - ; AIE2P-RA-NEXT: PseudoRET implicit $lr, implicit killed renamable $p0, implicit killed renamable $dc1 + ; AIE2P-RA-NEXT: $p0, $dc0 = PADDA_2D_split killed $p0, killed $m0, killed $dn0, killed $dj0, killed $dc0 + ; AIE2P-RA-NEXT: PseudoRET implicit $lr, implicit killed renamable $p0, implicit killed renamable $dc0 bb.1.entry: liveins: $p0, $p1, $d1 %20:ep = COPY $p0 @@ -444,10 +445,14 @@ body: | ; AIE2P-RA-NEXT: renamable $dc1 = MOV_PD_imm11_pseudo 0 ; AIE2P-RA-NEXT: {{ $}} ; AIE2P-RA-NEXT: bb.1: - ; AIE2P-RA-NEXT: liveins: $dc1, $dj1, $dn1, $m1, $p0 + ; AIE2P-RA-NEXT: liveins: $d1:0x0000000000200E00, $p0 ; AIE2P-RA-NEXT: {{ $}} - ; AIE2P-RA-NEXT: $p0, $dc1 = PADDA_2D_split killed $p0, killed $m1, killed $dn1, killed $dj1, killed $dc1 
- ; AIE2P-RA-NEXT: PseudoRET implicit $lr, implicit killed renamable $p0, implicit killed renamable $dc1 + ; AIE2P-RA-NEXT: renamable $dc0 = COPY renamable $dc1 + ; AIE2P-RA-NEXT: renamable $dn0 = COPY renamable $dn1 + ; AIE2P-RA-NEXT: renamable $dj0 = COPY renamable $dj1 + ; AIE2P-RA-NEXT: renamable $m0 = COPY killed renamable $m1 + ; AIE2P-RA-NEXT: $p0, $dc0 = PADDA_2D_split killed $p0, killed $m0, killed $dn0, killed $dj0, killed $dc0 + ; AIE2P-RA-NEXT: PseudoRET implicit $lr, implicit killed renamable $p0, implicit killed renamable $dc0 bb.1.entry: liveins: $p0, $m1, $dn1, $dj1 %20:ep = COPY $p0 @@ -623,7 +628,7 @@ body: | ; AIE2P-RA-NEXT: renamable $dc0 = LDA_dms_lda_idx_imm killed renamable $p1, 12 ; AIE2P-RA-NEXT: {{ $}} ; AIE2P-RA-NEXT: bb.1: - ; AIE2P-RA-NEXT: liveins: $dc0, $dj0, $dn0, $p0 + ; AIE2P-RA-NEXT: liveins: $d0:0x0000000000000E00, $p0 ; AIE2P-RA-NEXT: {{ $}} ; AIE2P-RA-NEXT: $p0, $dc0 = PADDA_2D_split killed $p0, undef $m0, killed $dn0, killed $dj0, killed $dc0 ; AIE2P-RA-NEXT: PseudoRET implicit $lr, implicit killed renamable $p0, implicit killed renamable $dc0 From 23c18b4bec02e2838f6cf7637334c0aae71f47d2 Mon Sep 17 00:00:00 2001 From: Andreu Carminati Date: Thu, 9 Oct 2025 07:35:35 -0600 Subject: [PATCH 3/9] [AIEX][NFC] Refactor some AIESuperReg utilities Co-Authored-By: Krishnam Tibrewala --- llvm/lib/Target/AIE/AIESuperRegRewriter.cpp | 238 +--------------- llvm/lib/Target/AIE/AIESuperRegUtils.cpp | 265 ++++++++++++++++++ llvm/lib/Target/AIE/AIESuperRegUtils.h | 81 ++++++ llvm/lib/Target/AIE/CMakeLists.txt | 1 + .../Target/AIE/aie2p/AIE2PTargetMachine.cpp | 26 +- 5 files changed, 366 insertions(+), 245 deletions(-) create mode 100644 llvm/lib/Target/AIE/AIESuperRegUtils.cpp create mode 100644 llvm/lib/Target/AIE/AIESuperRegUtils.h diff --git a/llvm/lib/Target/AIE/AIESuperRegRewriter.cpp b/llvm/lib/Target/AIE/AIESuperRegRewriter.cpp index 9c31ef7c91f7..44e84037df91 100644 --- a/llvm/lib/Target/AIE/AIESuperRegRewriter.cpp +++ 
b/llvm/lib/Target/AIE/AIESuperRegRewriter.cpp @@ -10,6 +10,7 @@ #include "AIEBaseInstrInfo.h" #include "AIEBaseRegisterInfo.h" +#include "AIESuperRegUtils.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallSet.h" @@ -63,80 +64,8 @@ class AIESuperRegRewriter : public MachineFunctionPass { } bool runOnMachineFunction(MachineFunction &Fn) override; - -private: - void rewriteSuperReg(Register Reg, Register AssignedPhysReg, - MachineRegisterInfo &MRI, const AIEBaseRegisterInfo &TRI, - VirtRegMap &VRM, LiveRegMatrix &LRM, LiveIntervals &LIS, - SlotIndexes &Indexes, LiveDebugVariables &DebugVars); }; -/// Returns the subreg indices that can be used to rewrite \p Reg into smaller -/// regs. Returns {} if the rewrite isn't possible. -static SmallSet getRewritableSubRegs(Register Reg, - const MachineRegisterInfo &MRI, - const AIEBaseRegisterInfo &TRI, - std::set &VisitedVRegs) { - if (Reg.isPhysical()) { - // TODO: One could use collectSubRegs() in AIEBaseInstrInfo.cpp - // But given that MOD registers are not part of the ABI, they should - // not appear as physical registers before RA. - LLVM_DEBUG(dbgs() << " Cannot rewrite physreg " << printReg(Reg, &TRI) - << "\n"); - return {}; - } - - auto &SubRegSplit = TRI.getSubRegSplit(MRI.getRegClass(Reg)->getID()); - if (SubRegSplit.size() <= 1) { - // Register does not have multiple subregs to be rewritten into. - LLVM_DEBUG(dbgs() << " Cannot rewrite " << printReg(Reg, &TRI, 0, &MRI) - << ": no sub-reg split\n"); - return {}; - } - - VisitedVRegs.insert(Reg); - SmallSet UsedSubRegs; - for (MachineOperand &RegOp : MRI.reg_operands(Reg)) { - int SubReg = RegOp.getSubReg(); - if (SubReg && SubRegSplit.count(SubReg)) { - UsedSubRegs.insert(SubReg); - } else if (RegOp.getParent()->isFullCopy()) { - // To rewrite a full copy, both operands need to be rewritable using - // their subregs. 
- Register DstReg = RegOp.getParent()->getOperand(0).getReg(); - if (!VisitedVRegs.count(DstReg) && - getRewritableSubRegs(DstReg, MRI, TRI, VisitedVRegs).empty()) { - LLVM_DEBUG(dbgs() << " Cannot rewrite " - << printReg(DstReg, &TRI, 0, &MRI) << " in " - << *RegOp.getParent()); - return {}; - } - Register SrcReg = RegOp.getParent()->getOperand(1).getReg(); - if (!VisitedVRegs.count(SrcReg) && - getRewritableSubRegs(SrcReg, MRI, TRI, VisitedVRegs).empty()) { - LLVM_DEBUG(dbgs() << " Cannot rewrite " - << printReg(SrcReg, &TRI, 0, &MRI) << " in " - << *RegOp.getParent()); - return {}; - } - UsedSubRegs.insert(SubRegSplit.begin(), SubRegSplit.end()); - } else { - LLVM_DEBUG(dbgs() << " Cannot rewrite " << RegOp << " in " - << *RegOp.getParent()); - return {}; - } - } - - return UsedSubRegs; -} - -static SmallSet getRewritableSubRegs(Register Reg, - const MachineRegisterInfo &MRI, - const AIEBaseRegisterInfo &TRI) { - std::set VisitedVRegs; - return getRewritableSubRegs(Reg, MRI, TRI, VisitedVRegs); -} - bool AIESuperRegRewriter::runOnMachineFunction(MachineFunction &MF) { LLVM_DEBUG(llvm::dbgs() << "*** Splitting super-registers: " << MF.getName() << " ***\n"); @@ -149,7 +78,7 @@ bool AIESuperRegRewriter::runOnMachineFunction(MachineFunction &MF) { LiveIntervals &LIS = getAnalysis().getLIS(); SlotIndexes &Indexes = getAnalysis().getSI(); LiveDebugVariables &DebugVars = getAnalysis().getLDV(); - std::map AssignedPhysRegs; + std::map>> AssignedPhysRegs; // Collect already-assigned VRegs that can be split into smaller ones. 
LLVM_DEBUG(VRM.dump()); @@ -173,8 +102,11 @@ bool AIESuperRegRewriter::runOnMachineFunction(MachineFunction &MF) { LLVM_DEBUG(dbgs() << "Analysing " << printReg(Reg, &TRI, 0, &MRI) << ":" << printRegClassOrBank(Reg, MRI, &TRI) << '\n'); - if (!getRewritableSubRegs(Reg, MRI, TRI).empty()) { - AssignedPhysRegs[Reg] = VRM.getPhys(Reg); + SmallSet RewritableSubRegs = + AIESuperRegUtils::getRewritableSubRegs(Reg, MRI, TRI); + if (!RewritableSubRegs.empty()) { + AssignedPhysRegs[Reg] = + std::make_pair(VRM.getPhys(Reg), RewritableSubRegs); LRM.unassign(LIS.getInterval(Reg)); } else { LLVM_DEBUG(dbgs() << "Could not rewrite " << printReg(Reg, &TRI, 0, &MRI) @@ -183,163 +115,17 @@ bool AIESuperRegRewriter::runOnMachineFunction(MachineFunction &MF) { } // Re-write all the collected VRegs - for (auto &[VReg, PhysReg] : AssignedPhysRegs) { - rewriteSuperReg(VReg, PhysReg, MRI, TRI, VRM, LRM, LIS, Indexes, DebugVars); + for (auto &[VReg, PhysRegAndSubRegs] : AssignedPhysRegs) { + const Register PhysReg = PhysRegAndSubRegs.first; + SmallSet &SubRegs = PhysRegAndSubRegs.second; + AIESuperRegUtils::rewriteSuperReg(VReg, PhysReg, SubRegs, MRI, TRI, VRM, + LRM, LIS, Indexes, DebugVars); } LLVM_DEBUG(VRM.dump()); return !AssignedPhysRegs.empty(); } -/// Return a mask of all the lanes that are live at \p Index -static LaneBitmask getLiveLanesAt(SlotIndex Index, Register Reg, - const LiveIntervals &LIS) { - const LiveInterval &LI = LIS.getInterval(Reg); - if (!LI.hasSubRanges()) - return LaneBitmask::getAll(); - - LaneBitmask LiveLanes; - for (const LiveInterval::SubRange &SubLI : LI.subranges()) { - if (SubLI.liveAt(Index)) - LiveLanes |= SubLI.LaneMask; - } - return LiveLanes; -} - -/// Rewrite a full copy into multiple copies using the subregs in \p CopySubRegs -static void rewriteFullCopy(MachineInstr &MI, const std::set &CopySubRegs, - LiveIntervals &LIS, const TargetInstrInfo &TII, - const TargetRegisterInfo &TRI, VirtRegMap &VRM, - LiveRegMatrix &LRM) { - 
assert(MI.isFullCopy()); - SlotIndex CopyIndex = LIS.getInstructionIndex(MI); - LLVM_DEBUG(dbgs() << " Changing full copy at " << CopyIndex << ": " << MI); - Register DstReg = MI.getOperand(0).getReg(); - Register SrcReg = MI.getOperand(1).getReg(); - LaneBitmask LiveSrcLanes = getLiveLanesAt(CopyIndex, SrcReg, LIS); - - LIS.removeVRegDefAt(LIS.getInterval(DstReg), CopyIndex.getRegSlot()); - - SmallSet RegistersToRepair; - for (int SubRegIdx : CopySubRegs) { - if ((LiveSrcLanes & TRI.getSubRegIndexLaneMask(SubRegIdx)).none()) { - LLVM_DEBUG(dbgs() << " Skip undef subreg " - << TRI.getSubRegIndexName(SubRegIdx) << "\n"); - continue; - } - - MachineInstr *PartCopy = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), - TII.get(TargetOpcode::COPY)) - .addReg(DstReg, RegState::Define, SubRegIdx) - .addReg(SrcReg, 0, SubRegIdx) - .getInstr(); - LLVM_DEBUG(dbgs() << " to " << *PartCopy); - LIS.InsertMachineInstrInMaps(*PartCopy); - LIS.getInterval(PartCopy->getOperand(0).getReg()); - RegistersToRepair.insert(PartCopy->getOperand(1).getReg()); - } - - LIS.RemoveMachineInstrFromMaps(MI); - MI.eraseFromParent(); - // As we don't handle all registers now (selective LI filter), - // We should make sure that all LiveIntervals are correct. - // If we dont't repair, MI will compose the LIs of some registers, - // what is not correct because MI was deleted. 
- for (Register R : RegistersToRepair) { - - if (!LIS.hasInterval(R)) - continue; - - if (VRM.hasPhys(R)) { - const MCRegister PhysReg = VRM.getPhys(R); - const LiveInterval &OldLI = LIS.getInterval(R); - LRM.unassign(OldLI); - LIS.removeInterval(R); - const LiveInterval &LI = LIS.createAndComputeVirtRegInterval(R); - LRM.assign(LI, PhysReg); - } else { - LIS.removeInterval(R); - LIS.createAndComputeVirtRegInterval(R); - } - } -} - -void AIESuperRegRewriter::rewriteSuperReg( - Register Reg, Register AssignedPhysReg, MachineRegisterInfo &MRI, - const AIEBaseRegisterInfo &TRI, VirtRegMap &VRM, LiveRegMatrix &LRM, - LiveIntervals &LIS, SlotIndexes &Indexes, LiveDebugVariables &DebugVars) { - LLVM_DEBUG(dbgs() << "Rewriting " << printReg(Reg, &TRI, 0, &MRI) << '\n'); - auto *TII = static_cast( - VRM.getMachineFunction().getSubtarget().getInstrInfo()); - - // Collect all the subreg indices to rewrite as independent vregs. - SmallMapVector SubRegToVReg; - const TargetRegisterClass *SuperRC = MRI.getRegClass(Reg); - SmallSet SubRegs = getRewritableSubRegs(Reg, MRI, TRI); - assert(!SubRegs.empty()); - for (int SubReg : SubRegs) { - const TargetRegisterClass *SubRC = TRI.getSubRegisterClass(SuperRC, SubReg); - SubRegToVReg[SubReg] = MRI.createVirtualRegister(SubRC); - } - - // Rewrite full copies into multiple copies using subregs - for (MachineInstr &MI : make_early_inc_range(MRI.reg_instructions(Reg))) { - if (MI.isFullCopy()) - rewriteFullCopy(MI, TRI.getSubRegSplit(MRI.getRegClass(Reg)->getID()), - LIS, *TII, TRI, VRM, LRM); - } - - LLVM_DEBUG(dbgs() << " Splitting range " << LIS.getInterval(Reg) << "\n"); - for (MachineOperand &RegOp : make_early_inc_range(MRI.reg_operands(Reg))) { - LLVM_DEBUG(dbgs() << " Changing " << *RegOp.getParent()); - int SubReg = RegOp.getSubReg(); - assert(SubReg); - RegOp.setReg(SubRegToVReg[SubReg]); - RegOp.setSubReg(0); - - // There might have been a write-undef due to only writing one sub-lane. 
- // Now that each sub-lane has its own VReg, the qualifier is invalid. - if (RegOp.isDef()) - RegOp.setIsUndef(false); - - // Make sure the right reg class is applied, some MIs might use compound - // classes with both 20 and 32 bits registers. - const TargetRegisterClass *OpRC = TII->getRegClass( - RegOp.getParent()->getDesc(), RegOp.getParent()->getOperandNo(&RegOp), - &TRI, VRM.getMachineFunction()); - MRI.constrainRegClass(SubRegToVReg[SubReg], OpRC); - - LLVM_DEBUG(dbgs() << " to " << *RegOp.getParent()); - } - - VRM.grow(); - LIS.removeInterval(Reg); - - for (auto &[SubRegIdx, VReg] : SubRegToVReg) { - MCRegister SubPhysReg = TRI.getSubReg(AssignedPhysReg, SubRegIdx); - LiveInterval &SubRegLI = LIS.getInterval(VReg); - LLVM_DEBUG(dbgs() << " Assigning Range: " << SubRegLI << '\n'); - - // By giving an independent VReg to each lane, we might have created - // multiple separate components. Give a VReg to each separate component. - SmallVector LIComponents; - LIS.splitSeparateComponents(SubRegLI, LIComponents); - LIComponents.push_back(&SubRegLI); - VRM.grow(); - - for (LiveInterval *LI : LIComponents) { - LRM.assign(*LI, SubPhysReg); - VRM.setRequiredPhys(LI->reg(), SubPhysReg); - LLVM_DEBUG(dbgs() << " Assigned " << printReg(LI->reg()) << "\n"); - } - } - - // Announce new VRegs so DBG locations can be updated. - auto NewVRegs = SmallVector(llvm::map_range( - SubRegToVReg, [&](auto &Mapping) { return Mapping.second; })); - DebugVars.splitRegister(Reg, NewVRegs, LIS); -} - } // end anonymous namespace char AIESuperRegRewriter::ID = 0; diff --git a/llvm/lib/Target/AIE/AIESuperRegUtils.cpp b/llvm/lib/Target/AIE/AIESuperRegUtils.cpp new file mode 100644 index 000000000000..25959825c147 --- /dev/null +++ b/llvm/lib/Target/AIE/AIESuperRegUtils.cpp @@ -0,0 +1,265 @@ +//===- AIESuperRegUtils.cpp -----------------------------------------------===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// +#include "AIESuperRegUtils.h" +#include "AIEBaseInstrInfo.h" +#include "AIEBaseRegisterInfo.h" +#include "llvm/CodeGen/LiveDebugVariables.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/LiveRegMatrix.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "aie-ra" + +namespace llvm::AIESuperRegUtils { + +/// Returns the subreg indices that can be used to rewrite \p Reg into smaller +/// regs. Returns {} if the rewrite isn't possible. +SmallSet getRewritableSubRegs(Register Reg, + const MachineRegisterInfo &MRI, + const AIEBaseRegisterInfo &TRI, + std::set &VisitedVRegs) { + if (Reg.isPhysical()) { + // TODO: One could use collectSubRegs() in AIEBaseInstrInfo.cpp + // But given that MOD registers are not part of the ABI, they should + // not appear as physical registers before RA. + LLVM_DEBUG(dbgs() << " Cannot rewrite physreg " << printReg(Reg, &TRI) + << "\n"); + return {}; + } + + auto &SubRegSplit = TRI.getSubRegSplit(MRI.getRegClass(Reg)->getID()); + if (SubRegSplit.size() <= 1) { + // Register does not have multiple subregs to be rewritten into. 
+ LLVM_DEBUG(dbgs() << " Cannot rewrite " << printReg(Reg, &TRI, 0, &MRI) + << ": no sub-reg split\n"); + return {}; + } + + VisitedVRegs.insert(Reg); + SmallSet UsedSubRegs; + for (MachineOperand &RegOp : MRI.reg_operands(Reg)) { + int SubReg = RegOp.getSubReg(); + if (SubReg && SubRegSplit.count(SubReg)) { + UsedSubRegs.insert(SubReg); + } else if (RegOp.getParent()->isFullCopy()) { + // To rewrite a full copy, both operands need to be rewritable using + // their subregs. + Register DstReg = RegOp.getParent()->getOperand(0).getReg(); + if (!VisitedVRegs.count(DstReg) && + getRewritableSubRegs(DstReg, MRI, TRI, VisitedVRegs).empty()) { + LLVM_DEBUG(dbgs() << " Cannot rewrite " + << printReg(DstReg, &TRI, 0, &MRI) << " in " + << *RegOp.getParent()); + return {}; + } + Register SrcReg = RegOp.getParent()->getOperand(1).getReg(); + if (!VisitedVRegs.count(SrcReg) && + getRewritableSubRegs(SrcReg, MRI, TRI, VisitedVRegs).empty()) { + LLVM_DEBUG(dbgs() << " Cannot rewrite " + << printReg(SrcReg, &TRI, 0, &MRI) << " in " + << *RegOp.getParent()); + return {}; + } + UsedSubRegs.insert(SubRegSplit.begin(), SubRegSplit.end()); + } else { + LLVM_DEBUG(dbgs() << " Cannot rewrite " << RegOp << " in " + << *RegOp.getParent()); + return {}; + } + } + + return UsedSubRegs; +} + +SmallSet getRewritableSubRegs(Register Reg, + const MachineRegisterInfo &MRI, + const AIEBaseRegisterInfo &TRI) { + std::set VisitedVRegs; + return getRewritableSubRegs(Reg, MRI, TRI, VisitedVRegs); +} + +/// Rewrite a full copy into multiple copies using the subregs in \p CopySubRegs +void rewriteFullCopy(MachineInstr &MI, const std::set &CopySubRegs, + LiveIntervals &LIS, const TargetInstrInfo &TII, + const TargetRegisterInfo &TRI, VirtRegMap &VRM, + LiveRegMatrix &LRM) { + assert(MI.isFullCopy()); + SlotIndex CopyIndex = LIS.getInstructionIndex(MI); + LLVM_DEBUG(dbgs() << " Changing full copy at " << CopyIndex << ": " << MI); + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = 
MI.getOperand(1).getReg(); + LaneBitmask LiveSrcLanes = getLiveLanesAt(CopyIndex, SrcReg, LIS); + + LIS.removeVRegDefAt(LIS.getInterval(DstReg), CopyIndex.getRegSlot()); + + SmallSet RegistersToRepair; + for (int SubRegIdx : CopySubRegs) { + if ((LiveSrcLanes & TRI.getSubRegIndexLaneMask(SubRegIdx)).none()) { + LLVM_DEBUG(dbgs() << " Skip undefined subreg " + << TRI.getSubRegIndexName(SubRegIdx) << "\n"); + continue; + } + + MachineInstr *PartCopy = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), + TII.get(TargetOpcode::COPY)) + .addReg(DstReg, RegState::Define, SubRegIdx) + .addReg(SrcReg, 0, SubRegIdx) + .getInstr(); + LLVM_DEBUG(dbgs() << " to " << *PartCopy); + LIS.InsertMachineInstrInMaps(*PartCopy); + LIS.getInterval(PartCopy->getOperand(0).getReg()); + RegistersToRepair.insert(PartCopy->getOperand(1).getReg()); + } + + LIS.RemoveMachineInstrFromMaps(MI); + MI.eraseFromParent(); + // As we don't handle all registers now (selective LI filter), + // We should make sure that all LiveIntervals are correct. + // If we don't repair, MI will compose the LIs of some registers, + // what is not correct because MI was deleted. 
+ repairLiveIntervals(RegistersToRepair, VRM, LRM, LIS); +} + +/// Return a mask of all the lanes that are live at \p Index +LaneBitmask getLiveLanesAt(SlotIndex Index, Register Reg, + const LiveIntervals &LIS) { + const LiveInterval &LI = LIS.getInterval(Reg); + if (!LI.hasSubRanges()) + return LaneBitmask::getAll(); + + LaneBitmask LiveLanes; + for (const LiveInterval::SubRange &SubLI : LI.subranges()) { + if (SubLI.liveAt(Index)) + LiveLanes |= SubLI.LaneMask; + } + return LiveLanes; +} + +void rewriteSuperReg(Register Reg, Register AssignedPhysReg, + SmallSet &SubRegs, MachineRegisterInfo &MRI, + const AIEBaseRegisterInfo &TRI, VirtRegMap &VRM, + LiveRegMatrix &LRM, LiveIntervals &LIS, + SlotIndexes &Indexes, LiveDebugVariables &DebugVars) { + LLVM_DEBUG(dbgs() << "Rewriting " << printReg(Reg, &TRI, 0, &MRI) << '\n'); + auto *TII = static_cast( + VRM.getMachineFunction().getSubtarget().getInstrInfo()); + + // Collect all the subreg indices to rewrite as independent vregs. + SmallMapVector SubRegToVReg; + const TargetRegisterClass *SuperRC = MRI.getRegClass(Reg); + assert(!SubRegs.empty()); + for (int SubReg : SubRegs) { + const TargetRegisterClass *SubRC = TRI.getSubRegisterClass(SuperRC, SubReg); + SubRegToVReg[SubReg] = MRI.createVirtualRegister(SubRC); + } + + // Rewrite full copies into multiple copies using subregs + for (MachineInstr &MI : make_early_inc_range(MRI.reg_instructions(Reg))) { + if (MI.isFullCopy()) + AIESuperRegUtils::rewriteFullCopy( + MI, TRI.getSubRegSplit(MRI.getRegClass(Reg)->getID()), LIS, *TII, TRI, + VRM, LRM); + } + + LLVM_DEBUG(dbgs() << " Splitting range " << LIS.getInterval(Reg) << "\n"); + for (MachineOperand &RegOp : make_early_inc_range(MRI.reg_operands(Reg))) { + LLVM_DEBUG(dbgs() << " Changing " << *RegOp.getParent()); + int SubReg = RegOp.getSubReg(); + assert(SubReg); + RegOp.setReg(SubRegToVReg[SubReg]); + RegOp.setSubReg(0); + + // There might have been a write-undefined due to only writing one sub-lane. 
+ // Now that each sub-lane has its own VReg, the qualifier is invalid. + if (RegOp.isDef()) + RegOp.setIsUndef(false); + + // Make sure the right reg class is applied, some MIs might use compound + // classes with both 20 and 32 bits registers. + const TargetRegisterClass *OpRC = TII->getRegClass( + RegOp.getParent()->getDesc(), RegOp.getParent()->getOperandNo(&RegOp), + &TRI, VRM.getMachineFunction()); + MRI.constrainRegClass(SubRegToVReg[SubReg], OpRC); + + LLVM_DEBUG(dbgs() << " to " << *RegOp.getParent()); + } + + VRM.grow(); + LIS.removeInterval(Reg); + + for (auto &[SubRegIdx, VReg] : SubRegToVReg) { + MCRegister SubPhysReg = TRI.getSubReg(AssignedPhysReg, SubRegIdx); + LiveInterval &SubRegLI = LIS.getInterval(VReg); + LLVM_DEBUG(dbgs() << " Assigning Range: " << SubRegLI << '\n'); + + // By giving an independent VReg to each lane, we might have created + // multiple separate components. Give a VReg to each separate component. + SmallVector LIComponents; + LIS.splitSeparateComponents(SubRegLI, LIComponents); + LIComponents.push_back(&SubRegLI); + VRM.grow(); + + for (LiveInterval *LI : LIComponents) { + LRM.assign(*LI, SubPhysReg); + VRM.setRequiredPhys(LI->reg(), SubPhysReg); + LLVM_DEBUG(dbgs() << " Assigned " << printReg(LI->reg()) << "\n"); + } + } + + // Announce new VRegs so DBG locations can be updated. + auto NewVRegs = SmallVector(llvm::map_range( + SubRegToVReg, [&](auto &Mapping) { return Mapping.second; })); + DebugVars.splitRegister(Reg, NewVRegs, LIS); +} + +bool isRegUsedBy2DOr3DInstruction(const MachineRegisterInfo &MRI, + const Register &R) { + + return llvm::any_of( + MRI.use_nodbg_instructions(R), [&](const MachineInstr &MI) { + auto &TII = *static_cast( + MI.getMF()->getSubtarget().getInstrInfo()); + + // We should recognize both cases, with and without splitting. A 2D/3D + // instruction will always be split or splittable. 
+ return TII.getOpcodeWithTupleOperands(MI.getOpcode()).has_value() || + TII.getOpcodeWithAtomicOperands(MI.getOpcode()).has_value(); + }); +} + +void repairLiveIntervals(SmallSet &RegistersToRepair, + VirtRegMap &VRM, LiveRegMatrix &LRM, + LiveIntervals &LIS) { + for (Register R : RegistersToRepair) { + + if (!LIS.hasInterval(R)) + continue; + + if (VRM.hasPhys(R)) { + const MCRegister PhysReg = VRM.getPhys(R); + const LiveInterval &OldLI = LIS.getInterval(R); + LRM.unassign(OldLI); + LIS.removeInterval(R); + const LiveInterval &LI = LIS.createAndComputeVirtRegInterval(R); + LRM.assign(LI, PhysReg); + } else { + LIS.removeInterval(R); + LIS.createAndComputeVirtRegInterval(R); + } + } +} + +} // namespace llvm::AIESuperRegUtils diff --git a/llvm/lib/Target/AIE/AIESuperRegUtils.h b/llvm/lib/Target/AIE/AIESuperRegUtils.h new file mode 100644 index 000000000000..54f698274f08 --- /dev/null +++ b/llvm/lib/Target/AIE/AIESuperRegUtils.h @@ -0,0 +1,81 @@ +//===-- AIESuperRegUtils.h ------------------------------------------------===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// +// +// This file contains helper functions to work with 2D/3D composite registers. 
+// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIB_TARGET_AIE_AIESUPERREGUTILS_H +#define LLVM_LIB_TARGET_AIE_AIESUPERREGUTILS_H + +#include "llvm/ADT/SmallSet.h" + +namespace llvm { +class Register; +class MachineRegisterInfo; +struct AIEBaseRegisterInfo; +class MachineInstr; +class LiveIntervals; +class TargetInstrInfo; +class TargetRegisterInfo; +struct LaneBitmask; +class SlotIndex; +class SlotIndexes; +class VirtRegMap; +class LiveRegMatrix; +class LiveDebugVariables; +} // namespace llvm + +namespace llvm::AIESuperRegUtils { + +/// Determines if a composite register can be safely decomposed into its +/// subregisters by analyzing all uses. A register is rewritable if all uses +/// either access specific subregisters or are full copies where both operands +/// are also rewritable. Returns the set of subregister indices that can be +/// used for rewriting, or an empty set if decomposition is not possible. +/// Physical registers and registers without subregister splits cannot be +/// rewritten. +/// +/// Returns the subreg indices that can be used to rewrite \p Reg into smaller +/// regs. Returns {} if the rewrite isn't possible. 
+SmallSet getRewritableSubRegs(Register Reg, + const MachineRegisterInfo &MRI, + const AIEBaseRegisterInfo &TRI, + std::set &VisitedVRegs); + +SmallSet getRewritableSubRegs(Register Reg, + const MachineRegisterInfo &MRI, + const AIEBaseRegisterInfo &TRI); + +/// Rewrite a full copy into multiple copies using the subregs in \p CopySubRegs +void rewriteFullCopy(MachineInstr &MI, const std::set &CopySubRegs, + LiveIntervals &LIS, const TargetInstrInfo &TII, + const TargetRegisterInfo &TRI, VirtRegMap &VRM, + LiveRegMatrix &LRM); + +/// Return a mask of all the lanes that are live at \p Index +LaneBitmask getLiveLanesAt(SlotIndex Index, Register Reg, + const LiveIntervals &LIS); + +void rewriteSuperReg(Register Reg, Register AssignedPhysReg, + SmallSet &SubRegs, MachineRegisterInfo &MRI, + const AIEBaseRegisterInfo &TRI, VirtRegMap &VRM, + LiveRegMatrix &LRM, LiveIntervals &LIS, + SlotIndexes &Indexes, LiveDebugVariables &DebugVars); + +bool isRegUsedBy2DOr3DInstruction(const MachineRegisterInfo &MRI, + const Register &R); + +void repairLiveIntervals(SmallSet &RegistersToRepair, + VirtRegMap &VRM, LiveRegMatrix &LRM, + LiveIntervals &LIS); + +} // namespace llvm::AIESuperRegUtils + +#endif diff --git a/llvm/lib/Target/AIE/CMakeLists.txt b/llvm/lib/Target/AIE/CMakeLists.txt index 1820b3473814..4bbc0011146f 100644 --- a/llvm/lib/Target/AIE/CMakeLists.txt +++ b/llvm/lib/Target/AIE/CMakeLists.txt @@ -124,6 +124,7 @@ add_llvm_target(AIECodeGen AIESubRegConstrainer.cpp AIESWPSolver.cpp AIESuperRegRewriter.cpp + AIESuperRegUtils.cpp AIETargetObjectFile.cpp AIE2AsmPrinter.cpp AIE2FrameLowering.cpp diff --git a/llvm/lib/Target/AIE/aie2p/AIE2PTargetMachine.cpp b/llvm/lib/Target/AIE/aie2p/AIE2PTargetMachine.cpp index 357feb20b699..92c72d01135d 100644 --- a/llvm/lib/Target/AIE/aie2p/AIE2PTargetMachine.cpp +++ b/llvm/lib/Target/AIE/aie2p/AIE2PTargetMachine.cpp @@ -14,6 +14,7 @@ #include "AIE2PTargetMachine.h" #include "AIE2PTargetTransformInfo.h" +#include "AIESuperRegUtils.h" 
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/CodeGen/TargetRegisterInfo.h" @@ -62,21 +63,6 @@ void AIE2PPassConfig::addPreRegBankSelect() { } } -static bool isRegUsedBy2DOr3DInstruction(const MachineRegisterInfo &MRI, - const Register &R) { - - return llvm::any_of( - MRI.use_nodbg_instructions(R), [&](const MachineInstr &MI) { - auto &TII = *static_cast( - MI.getMF()->getSubtarget().getInstrInfo()); - - // We should recognize both cases, with and without splitting. A 2D/3D - // instruction will always be split os splittable. - return TII.getOpcodeWithTupleOperands(MI.getOpcode()).has_value() || - TII.getOpcodeWithAtomicOperands(MI.getOpcode()).has_value(); - }); -} - static bool onlyAllocate3DRegisters(const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI, const Register &R) { @@ -84,8 +70,9 @@ static bool onlyAllocate3DRegisters(const TargetRegisterInfo &TRI, const TargetRegisterClass *RegClass = MRI.getRegClass(R); if (!AIE2P::eDSRegClass.hasSubClassEq(RegClass)) return false; - return EnableFineGrainedStagedRA ? isRegUsedBy2DOr3DInstruction(MRI, R) - : true; + return EnableFineGrainedStagedRA + ? AIESuperRegUtils::isRegUsedBy2DOr3DInstruction(MRI, R) + : true; } static bool onlyAllocate3D2DRegisters(const TargetRegisterInfo &TRI, @@ -95,8 +82,9 @@ static bool onlyAllocate3D2DRegisters(const TargetRegisterInfo &TRI, if (!AIE2P::eDSRegClass.hasSubClassEq(RegClass) && !AIE2P::eDRegClass.hasSubClassEq(RegClass)) return false; - return EnableFineGrainedStagedRA ? isRegUsedBy2DOr3DInstruction(MRI, R) - : true; + return EnableFineGrainedStagedRA + ? AIESuperRegUtils::isRegUsedBy2DOr3DInstruction(MRI, R) + : true; } static bool onlyAllocateMRegisters(const TargetRegisterInfo &TRI, From 264a49109e42e338aa7f07951e5674b1fb1f2b6e Mon Sep 17 00:00:00 2001 From: Andreu Carminati Date: Thu, 30 Oct 2025 04:52:36 -0600 Subject: [PATCH 4/9] [AIE2P] Add a base test to detect missing undef flags in copy expansion. 
The goal of this test is to check that we properly insert the undef flag on the def side of an expanded full copy. On a sub-register def operand, the undef flag refers to the part of the register that isn't written. A sub-register def implicitly reads the other parts of the register being redefined unless the flag is set, and a missing flag can force the related register to be inserted into the live-out set of the predecessor block, causing dominance problems. Co-Authored-By: Krishnam Tibrewala --- .../AIE/aie2p/ra/staged-ra-check-undef.mir | 87 +++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-check-undef.mir diff --git a/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-check-undef.mir b/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-check-undef.mir new file mode 100644 index 000000000000..ace286324372 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-check-undef.mir @@ -0,0 +1,87 @@ + +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates +# +# RUN: not --crash llc -O2 -mtriple=aie2p -start-before=greedy \ +# RUN: -stop-before=aie-unallocated-superreg-rewrite -o /dev/null %s 2>&1 | FileCheck %s + +# The goal of this test is to check if we properly insert undef flag on the def side +# of a expanded full copy. On a sub-register def operand, it refers to the part of the +# register that isn't written. A sub-register def implicitly reads the other parts of the +# register being redefined unless the flag is set, and a missing flag can +# force the related register to be inserted in liveout set of the predecessors block, +# causing dominance problems.
+ +# CHECK: LLVM ERROR: Found 1 machine code errors + +--- +name: use_all_2d_regs +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1(0x80000000) + + undef %80.sub_dim_stride:ed = MOV_PD_imm11_pseudo 1 + %80.sub_mod:ed = MOV_PD_imm11_pseudo 0 + undef %105.sub_dim_size:ed = MOV_PD_imm11_pseudo -1 + %105.sub_mod:ed = COPY %80.sub_dim_stride + %105.sub_dim_stride:ed = COPY %80.sub_mod + undef %101.sub_dim_size:ed = COPY %80.sub_mod + undef %97.sub_dim_size:ed = COPY %80.sub_mod + undef %90.sub_dim_size:ed = COPY %80.sub_mod + undef %86.sub_dim_size:ed = COPY %80.sub_mod + undef %82.sub_dim_size:ed = COPY %80.sub_mod + %80.sub_dim_size:ed = COPY %80.sub_mod + %105.sub_dim_count:ed = COPY %80.sub_mod + %97.sub_dim_count:ed = COPY %80.sub_mod + %90.sub_dim_count:ed = COPY %80.sub_mod + %101.sub_dim_count:ed = COPY %80.sub_mod + undef %94.sub_dim_count:ed = COPY %80.sub_mod + %86.sub_dim_count:ed = COPY %80.sub_mod + %82.sub_dim_count:ed = COPY %80.sub_mod + %80.sub_dim_count:ed = COPY %80.sub_mod + undef %77.sub_dim_count:ed = COPY %80.sub_mod + + bb.1: + successors: %bb.1(0x80000000) + + %10:ep = MOV_PD_imm11_pseudo 0 + %18:ep = MOV_PD_imm11_pseudo 0 + %22:ep = MOV_PD_imm11_pseudo 0 + %26:ep = MOV_PD_imm11_pseudo 0 + dead %10:ep, %105.sub_dim_count:ed = PADD_2D_pseudo_split %10, %105.sub_mod, %105.sub_dim_size, %105.sub_dim_stride, %105.sub_dim_count + %30:ep = MOV_PD_imm11_pseudo 0 + %101.sub_mod:ed = COPY %105.sub_mod + %101.sub_dim_stride:ed = COPY %105.sub_dim_stride + dead %18:ep, %101.sub_dim_count:ed = PADD_2D_pseudo_split %18, %101.sub_mod, %101.sub_dim_size, %101.sub_dim_stride, %101.sub_dim_count + %34:ep = MOV_PD_imm11_pseudo 0 + %97.sub_mod:ed = COPY %105.sub_mod + %97.sub_dim_stride:ed = COPY %105.sub_dim_stride + dead %22:ep, %97.sub_dim_count:ed = PADD_2D_pseudo_split %22, %97.sub_mod, %97.sub_dim_size, %97.sub_dim_stride, %97.sub_dim_count + %94.sub_mod:ed = COPY %105.sub_mod + %94.sub_dim_size:ed = COPY %105.sub_dim_size + %38:ep = 
MOV_PD_imm11_pseudo 0 + %94.sub_dim_stride:ed = COPY %105.sub_dim_stride + dead %26:ep, %94.sub_dim_count:ed = PADD_2D_pseudo_split %26, %94.sub_mod, %94.sub_dim_size, %94.sub_dim_stride, %94.sub_dim_count + %90.sub_mod:ed = COPY %105.sub_mod + %90.sub_dim_stride:ed = COPY %105.sub_dim_stride + dead %30:ep, %90.sub_dim_count:ed = PADD_2D_pseudo_split %30, %90.sub_mod, %90.sub_dim_size, %90.sub_dim_stride, %90.sub_dim_count + %42:ep = MOV_PD_imm11_pseudo 0 + %86.sub_mod:ed = COPY %105.sub_mod + %86.sub_dim_stride:ed = COPY %105.sub_dim_stride + dead %34:ep, %86.sub_dim_count:ed = PADD_2D_pseudo_split %34, %86.sub_mod, %86.sub_dim_size, %86.sub_dim_stride, %86.sub_dim_count + dead %42:ep, %80.sub_dim_count:ed = PADD_2D_pseudo_split %42, %80.sub_mod, %80.sub_dim_size, %80.sub_dim_stride, %80.sub_dim_count + %82.sub_mod:ed = COPY %105.sub_mod + %46:ep = MOV_PD_imm11_pseudo 0 + %82.sub_dim_stride:ed = COPY %105.sub_dim_stride + dead %38:ep, %82.sub_dim_count:ed = PADD_2D_pseudo_split %38, %82.sub_mod, %82.sub_dim_size, %82.sub_dim_stride, %82.sub_dim_count + %77.sub_dim_size:ed = COPY %80.sub_mod + %77.sub_mod:ed = COPY %80.sub_mod + %77.sub_dim_stride:ed = COPY %105.sub_dim_stride + dead %46:ep, %77.sub_dim_count:ed = PADD_2D_pseudo_split %46, %77.sub_mod, %77.sub_dim_size, %77.sub_dim_stride, %77.sub_dim_count + PseudoJ_jump_imm %bb.1 + +... From 99758131d56daf994d92fd241735d24f0ac9f359 Mon Sep 17 00:00:00 2001 From: Andreu Carminati Date: Fri, 31 Oct 2025 06:37:09 -0600 Subject: [PATCH 5/9] [AIEX] Add missing undef flags in copy expansion This will properly handle the use of non-dominating definitions. We also change the handling of the destination registers in two parts: *Copy expansion: we replace the original index by the index of the first lane copy to avoid the creation of LRs with just one instruction; in this way we keep the LI correct. *Rewrite: reset dead flags if necessary.
Co-Authored-By: Krishnam Tibrewala --- llvm/lib/Target/AIE/AIESuperRegUtils.cpp | 57 +++++++++++- .../AIE/aie2p/ra/staged-ra-check-undef.mir | 91 +++++++++++++++++-- 2 files changed, 137 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/AIE/AIESuperRegUtils.cpp b/llvm/lib/Target/AIE/AIESuperRegUtils.cpp index 25959825c147..7bf6011a7cf6 100644 --- a/llvm/lib/Target/AIE/AIESuperRegUtils.cpp +++ b/llvm/lib/Target/AIE/AIESuperRegUtils.cpp @@ -103,8 +103,26 @@ void rewriteFullCopy(MachineInstr &MI, const std::set &CopySubRegs, Register SrcReg = MI.getOperand(1).getReg(); LaneBitmask LiveSrcLanes = getLiveLanesAt(CopyIndex, SrcReg, LIS); - LIS.removeVRegDefAt(LIS.getInterval(DstReg), CopyIndex.getRegSlot()); + if (!VRM.hasPhys(DstReg)) { + // FIXME: This pass may cause verification failures. The fix should + // be in the MachineVerifier. This is a very uncommon case where the + // destination register was not allocated yet. + // The machine verifier does not properly handle the semantics of: + // 1. **Partial register definitions with `undefined`**: When the first + // subregister is defined with `undefined`, it doesn't expect subsequent + // definitions to implicitly read that lane. + // 2. **Lane-based liveness for composite registers**: The verifier expects + // a continuous live range for the entire register, but with subregister + // definitions, different lanes have different live ranges that are being + // built up incrementally. + // 3. **Implicit reads in partial definitions**: The verifier doesn't + // recognize that `%18.sub_dim_size:ed = COPY ...` implicitly reads the + // previously defined `%18.sub_dim_count` lane. 
+ MI.getMF()->getProperties().set( + MachineFunctionProperties::Property::FailsVerification); + } + MachineInstr *FirstMI = nullptr; SmallSet RegistersToRepair; for (int SubRegIdx : CopySubRegs) { if ((LiveSrcLanes & TRI.getSubRegIndexLaneMask(SubRegIdx)).none()) { @@ -118,13 +136,31 @@ void rewriteFullCopy(MachineInstr &MI, const std::set &CopySubRegs, .addReg(DstReg, RegState::Define, SubRegIdx) .addReg(SrcReg, 0, SubRegIdx) .getInstr(); + + // Only set undefined on the first partial copy. The first copy doesn't read + // other lanes, but subsequent copies do read the previously written lanes. + // Setting undefined on all copies breaks live interval tracking and causes + // machine verifier errors. + if (!FirstMI) { + PartCopy->getOperand(0).setIsUndef(); + FirstMI = PartCopy; + } LLVM_DEBUG(dbgs() << " to " << *PartCopy); LIS.InsertMachineInstrInMaps(*PartCopy); - LIS.getInterval(PartCopy->getOperand(0).getReg()); + // We need to repair only the Src register. For the Dst register, + // we don't need to do anything explicit, because we will replace the + // original copy by the first lane copy in LIS. We avoid the explicit repair + // of Dst reg because LIS will create a exclusive range for each copy, + // because it considers that every sub-lane copy will make the preceding + // one dead, what is not true for composite registers. + // TODO: investigate why subregister liveness is being ignored by LIS + // at this point. RegistersToRepair.insert(PartCopy->getOperand(1).getReg()); } - LIS.RemoveMachineInstrFromMaps(MI); + // Replace the original copy by the first one, so we automatically repair + // DstReg's LI. + LIS.ReplaceMachineInstrInMaps(MI, *FirstMI); MI.eraseFromParent(); // As we don't handle all registers now (selective LI filter), // We should make sure that all LiveIntervals are correct. @@ -184,8 +220,17 @@ void rewriteSuperReg(Register Reg, Register AssignedPhysReg, // There might have been a write-undefined due to only writing one sub-lane. 
// Now that each sub-lane has its own VReg, the qualifier is invalid. - if (RegOp.isDef()) + if (RegOp.isDef()) { RegOp.setIsUndef(false); + // Also unset correctly the dead flag if the instruction + // is not the dead slot in the live range (the def is still alive). + LiveInterval &LI = LIS.getInterval(Reg); + MachineInstr *DefMI = RegOp.getParent(); + SlotIndex Def = LIS.getInstructionIndex(*DefMI); + LiveRange::iterator I = LI.FindSegmentContaining(Def); + if (I->end != Def.getDeadSlot()) + RegOp.setIsDead(false); + } // Make sure the right reg class is applied, some MIs might use compound // classes with both 20 and 32 bits registers. @@ -259,6 +304,10 @@ void repairLiveIntervals(SmallSet &RegistersToRepair, LIS.removeInterval(R); LIS.createAndComputeVirtRegInterval(R); } + + // After recomputing, shrink the interval to remove any invalid segments + // This is important for registers with undefined definitions. + LIS.shrinkToUses(&LIS.getInterval(R)); } } diff --git a/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-check-undef.mir b/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-check-undef.mir index ace286324372..945192cd39f0 100644 --- a/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-check-undef.mir +++ b/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-check-undef.mir @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # This file is licensed under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. @@ -5,8 +6,8 @@ # # (c) Copyright 2025 Advanced Micro Devices, Inc. 
or its affiliates # -# RUN: not --crash llc -O2 -mtriple=aie2p -start-before=greedy \ -# RUN: -stop-before=aie-unallocated-superreg-rewrite -o /dev/null %s 2>&1 | FileCheck %s +# RUN: llc -O2 -mtriple=aie2p -start-before=greedy \ +# RUN: -stop-before=aie-unallocated-superreg-rewrite -verify-machineinstrs %s -o - | FileCheck %s # The goal of this test is to check if we properly insert undef flag on the def side # of a expanded full copy. On a sub-register def operand, it refers to the part of the @@ -15,15 +16,91 @@ # force the related register to be inserted in liveout set of the predecessors block, # causing dominance problems. -# CHECK: LLVM ERROR: Found 1 machine code errors - --- name: use_all_2d_regs tracksRegLiveness: true body: | + ; CHECK-LABEL: name: use_all_2d_regs + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: undef [[MOV_PD_imm11_pseudo:%[0-9]+]].sub_dim_stride:ed = MOV_PD_imm11_pseudo 1 + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]].sub_mod:ed = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:edn = MOV_PD_imm11_pseudo -1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:em = COPY [[MOV_PD_imm11_pseudo]].sub_dim_stride + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:edj = COPY [[MOV_PD_imm11_pseudo]].sub_mod + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:edn = COPY [[MOV_PD_imm11_pseudo]].sub_mod + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:edn = COPY [[MOV_PD_imm11_pseudo]].sub_mod + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:edn = COPY [[MOV_PD_imm11_pseudo]].sub_mod + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:edn = COPY [[MOV_PD_imm11_pseudo]].sub_mod + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:edn = COPY [[MOV_PD_imm11_pseudo]].sub_mod + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]].sub_dim_size:ed = COPY [[MOV_PD_imm11_pseudo]].sub_mod + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:edc = COPY [[MOV_PD_imm11_pseudo]].sub_mod + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:edc = COPY [[MOV_PD_imm11_pseudo]].sub_mod + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:edc = COPY 
[[MOV_PD_imm11_pseudo]].sub_mod + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:edc = COPY [[MOV_PD_imm11_pseudo]].sub_mod + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:edc = COPY [[MOV_PD_imm11_pseudo]].sub_mod + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:edc = COPY [[MOV_PD_imm11_pseudo]].sub_mod + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:edc = COPY [[MOV_PD_imm11_pseudo]].sub_mod + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]].sub_dim_count:ed = COPY [[MOV_PD_imm11_pseudo]].sub_mod + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:ed = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:edc = COPY [[MOV_PD_imm11_pseudo]].sub_mod + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo2:%[0-9]+]]:ep = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo3:%[0-9]+]]:ep = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo4:%[0-9]+]]:ep = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo5:%[0-9]+]]:ep = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: dead [[MOV_PD_imm11_pseudo2:%[0-9]+]]:ep, [[COPY7:%[0-9]+]]:edc = PADD_2D_pseudo_split [[MOV_PD_imm11_pseudo2]], [[COPY]], [[MOV_PD_imm11_pseudo1]], [[COPY1]], [[COPY7]] + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo6:%[0-9]+]]:ep = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:em = COPY [[COPY]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:edj = COPY [[COPY1]] + ; CHECK-NEXT: dead [[MOV_PD_imm11_pseudo3:%[0-9]+]]:ep, [[COPY10:%[0-9]+]]:edc = PADD_2D_pseudo_split [[MOV_PD_imm11_pseudo3]], [[COPY16]], [[COPY2]], [[COPY17]], [[COPY10]] + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo7:%[0-9]+]]:ep = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:em = COPY [[COPY]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:edj = COPY [[COPY1]] + ; CHECK-NEXT: dead [[MOV_PD_imm11_pseudo4:%[0-9]+]]:ep, [[COPY8:%[0-9]+]]:edc = PADD_2D_pseudo_split [[MOV_PD_imm11_pseudo4]], [[COPY18]], [[COPY3]], [[COPY19]], [[COPY8]] + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:em = COPY [[COPY]] + ; CHECK-NEXT: 
[[COPY21:%[0-9]+]]:edn = COPY [[MOV_PD_imm11_pseudo1]] + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo8:%[0-9]+]]:ep = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: [[COPY22:%[0-9]+]]:edj = COPY [[COPY1]] + ; CHECK-NEXT: dead [[MOV_PD_imm11_pseudo5:%[0-9]+]]:ep, [[COPY11:%[0-9]+]]:edc = PADD_2D_pseudo_split [[MOV_PD_imm11_pseudo5]], [[COPY20]], [[COPY21]], [[COPY22]], [[COPY11]] + ; CHECK-NEXT: [[COPY23:%[0-9]+]]:em = COPY [[COPY]] + ; CHECK-NEXT: [[COPY24:%[0-9]+]]:edj = COPY [[COPY1]] + ; CHECK-NEXT: dead [[MOV_PD_imm11_pseudo6:%[0-9]+]]:ep, [[COPY9:%[0-9]+]]:edc = PADD_2D_pseudo_split [[MOV_PD_imm11_pseudo6]], [[COPY23]], [[COPY4]], [[COPY24]], [[COPY9]] + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo9:%[0-9]+]]:ep = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: [[COPY25:%[0-9]+]]:em = COPY [[COPY]] + ; CHECK-NEXT: [[COPY26:%[0-9]+]]:edj = COPY [[COPY1]] + ; CHECK-NEXT: dead [[MOV_PD_imm11_pseudo7:%[0-9]+]]:ep, [[COPY12:%[0-9]+]]:edc = PADD_2D_pseudo_split [[MOV_PD_imm11_pseudo7]], [[COPY25]], [[COPY5]], [[COPY26]], [[COPY12]] + ; CHECK-NEXT: undef [[COPY27:%[0-9]+]].sub_dim_count:ed = COPY [[COPY10]] { + ; CHECK-NEXT: internal [[COPY27]].sub_dim_size:ed = COPY [[COPY2]] + ; CHECK-NEXT: } + ; CHECK-NEXT: [[COPY28:%[0-9]+]]:edc = COPY [[COPY14]].sub_dim_count + ; CHECK-NEXT: [[COPY29:%[0-9]+]]:edn = COPY [[COPY14]].sub_dim_size + ; CHECK-NEXT: [[COPY30:%[0-9]+]]:edj = COPY [[COPY14]].sub_dim_stride + ; CHECK-NEXT: [[COPY31:%[0-9]+]]:em = COPY [[COPY14]].sub_mod + ; CHECK-NEXT: dead [[MOV_PD_imm11_pseudo9:%[0-9]+]]:ep, [[COPY28:%[0-9]+]]:edc = PADD_2D_pseudo_split [[MOV_PD_imm11_pseudo9]], [[COPY31]], [[COPY29]], [[COPY30]], [[COPY28]] + ; CHECK-NEXT: [[COPY32:%[0-9]+]]:em = COPY [[COPY]] + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo10:%[0-9]+]]:ep = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: [[COPY33:%[0-9]+]]:edj = COPY [[COPY1]] + ; CHECK-NEXT: dead [[MOV_PD_imm11_pseudo8:%[0-9]+]]:ep, [[COPY13:%[0-9]+]]:edc = PADD_2D_pseudo_split [[MOV_PD_imm11_pseudo8]], [[COPY32]], [[COPY6]], [[COPY33]], [[COPY13]] + ; 
CHECK-NEXT: [[COPY34:%[0-9]+]]:edn = COPY [[COPY31]] + ; CHECK-NEXT: undef [[COPY14:%[0-9]+]].sub_dim_count:ed = COPY [[COPY28]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]].sub_dim_size:ed = COPY [[COPY29]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]].sub_dim_stride:ed = COPY [[COPY30]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]].sub_mod:ed = COPY [[COPY31]] + ; CHECK-NEXT: [[COPY35:%[0-9]+]]:em = COPY [[COPY31]] + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:edc = COPY [[COPY27]].sub_dim_count { + ; CHECK-NEXT: internal [[COPY2]]:edn = COPY [[COPY27]].sub_dim_size + ; CHECK-NEXT: } + ; CHECK-NEXT: [[COPY36:%[0-9]+]]:edj = COPY [[COPY1]] + ; CHECK-NEXT: dead [[MOV_PD_imm11_pseudo10:%[0-9]+]]:ep, [[COPY15:%[0-9]+]]:edc = PADD_2D_pseudo_split [[MOV_PD_imm11_pseudo10]], [[COPY35]], [[COPY34]], [[COPY36]], [[COPY15]] + ; CHECK-NEXT: PseudoJ_jump_imm %bb.1 bb.0: successors: %bb.1(0x80000000) - + undef %80.sub_dim_stride:ed = MOV_PD_imm11_pseudo 1 %80.sub_mod:ed = MOV_PD_imm11_pseudo 0 undef %105.sub_dim_size:ed = MOV_PD_imm11_pseudo -1 @@ -44,10 +121,10 @@ body: | %82.sub_dim_count:ed = COPY %80.sub_mod %80.sub_dim_count:ed = COPY %80.sub_mod undef %77.sub_dim_count:ed = COPY %80.sub_mod - + bb.1: successors: %bb.1(0x80000000) - + %10:ep = MOV_PD_imm11_pseudo 0 %18:ep = MOV_PD_imm11_pseudo 0 %22:ep = MOV_PD_imm11_pseudo 0 From 1968bf5bdbc3b67dd23f01376f4d59c861a6d619 Mon Sep 17 00:00:00 2001 From: Andreu Carminati Date: Tue, 21 Oct 2025 03:17:49 -0600 Subject: [PATCH 6/9] [AIE2P] Add a base lit test with unallocated 2D/3D regs before main Greedy run Co-Authored-By: Krishnam Tibrewala --- .../aie2p/ra/staged-rewrite-unallocated.mir | 79 +++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 llvm/test/CodeGen/AIE/aie2p/ra/staged-rewrite-unallocated.mir diff --git a/llvm/test/CodeGen/AIE/aie2p/ra/staged-rewrite-unallocated.mir b/llvm/test/CodeGen/AIE/aie2p/ra/staged-rewrite-unallocated.mir new file mode 100644 index 000000000000..40d2ad43e090 --- /dev/null +++ 
b/llvm/test/CodeGen/AIE/aie2p/ra/staged-rewrite-unallocated.mir @@ -0,0 +1,79 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc -O2 -mtriple=aie2p -verify-machineinstrs -start-before=greedy \ +# RUN: -stop-before=virtregrewriter %s -o - | FileCheck %s + +# This test exposes some rewriting opportunities. Please note +# that the registers directly used by the 3d instruction should not touched +# because they already have physical registers assigned (are allocated). + +--- +name: rewrite_unallocated +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: rewrite_unallocated + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[MOV_RLC_imm11_pseudo:%[0-9]+]]:erf2 = MOV_RLC_imm11_pseudo 0 + ; CHECK-NEXT: undef [[VBCST_32_:%[0-9]+]].sub_512_lo:vec1024 = VBCST_32 [[MOV_RLC_imm11_pseudo]] + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em_as_32bit = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:edjl = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:ednl = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:edcl = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:ednh = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:edch = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:edjh = COPY [[COPY]] + ; CHECK-NEXT: [[VBCST_32_:%[0-9]+]].sub_512_hi:vec1024 = COPY [[VBCST_32_]].sub_512_lo + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:eldfiforeg = COPY [[VBCST_32_]] + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:eps = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; 
CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:edcl = COPY [[COPY2]] + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:ednl = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:edjl = COPY [[COPY]] + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:em_as_32bit = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:edch = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:ednh = COPY [[COPY3]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:edjh = COPY [[COPY5]] + ; CHECK-NEXT: undef [[COPY14:%[0-9]+]].sub_ptr:epsrfldf = COPY [[MOV_PD_imm11_pseudo1]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]].sub_fifo:epsrfldf = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]].sub_avail:epsrfldf = COPY [[MOV_RLC_imm11_pseudo]] + ; CHECK-NEXT: dead [[VLD_POP_576_3D_pseudo_split:%[0-9]+]]:vec576, dead [[COPY14:%[0-9]+]].sub_ptr:epsrfldf, dead [[COPY14:%[0-9]+]].sub_fifo:epsrfldf, dead [[COPY14:%[0-9]+]].sub_avail:epsrfldf, dead [[COPY7:%[0-9]+]]:edcl, dead [[COPY11:%[0-9]+]]:edch = VLD_POP_576_3D_pseudo_split [[COPY14]].sub_ptr, [[COPY14]].sub_fifo, [[COPY14]].sub_avail, [[COPY10]], [[COPY8]], [[COPY9]], [[COPY7]], undef %23:em_as_32bit, [[COPY12]], [[COPY13]], [[COPY11]], implicit-def $srfifo_uf :: (load unknown-size from `ptr addrspace(5) null`, align 1, addrspace 5) + ; CHECK-NEXT: PseudoJ_jump_imm %bb.1 + bb.0: + successors: %bb.1(0x80000000) + + %9:erf2 = MOV_RLC_imm11_pseudo 0 + undef %8.sub_512_lo:vec1024 = VBCST_32 %9 + undef %14.sub_mod:eds = MOV_PD_imm11_pseudo 0 + %14.sub_dim_stride:eds = COPY %14.sub_mod + %14.sub_dim_size:eds = COPY %14.sub_mod + %14.sub_dim_count:eds = COPY %14.sub_mod + %14.sub_hi_dim_then_sub_dim_size:eds = COPY %14.sub_mod + %14.sub_hi_dim_then_sub_dim_count:eds = COPY %14.sub_mod + %14.sub_hi_dim_then_sub_dim_stride:eds = COPY %14.sub_dim_stride + %8.sub_512_hi:vec1024 = COPY %8.sub_512_lo + %12:eldfiforeg = COPY %8 + %7:eps = MOV_PD_imm11_pseudo 0 + + bb.1: + successors: %bb.1(0x80000000) + + %23:eds = COPY %14 + undef %22.sub_ptr:epsrfldf = COPY %7 + %22.sub_fifo:epsrfldf = COPY %12 + 
%22.sub_avail:epsrfldf = COPY %9 + dead %13:vec576, dead %22.sub_ptr:epsrfldf, dead %22.sub_fifo:epsrfldf, dead %22.sub_avail:epsrfldf, dead %23.sub_dim_count:eds, dead %23.sub_hi_dim_then_sub_dim_count:eds = VLD_POP_576_3D_pseudo_split %22.sub_ptr, %22.sub_fifo, %22.sub_avail, %23.sub_mod, %23.sub_dim_size, %23.sub_dim_stride, %23.sub_dim_count, undef %23.sub_hi_dim_then_sub_mod, %23.sub_hi_dim_then_sub_dim_size, %23.sub_hi_dim_then_sub_dim_stride, %23.sub_hi_dim_then_sub_dim_count, implicit-def $srfifo_uf :: (load unknown-size from `ptr addrspace(5) null`, align 1, addrspace 5) + PseudoJ_jump_imm %bb.1 + +... From 3527e24ef3025f120bb13efd04ce076c23ea3478 Mon Sep 17 00:00:00 2001 From: Andreu Carminati Date: Fri, 10 Oct 2025 05:38:53 -0600 Subject: [PATCH 7/9] [AIEX] Add a Pass to expand unallocated 2D/3D into individual ones If we don't need a full register, we can expand to individual lanes. Co-Authored-By: Krishnam Tibrewala --- llvm/lib/Target/AIE/AIE.h | 3 + llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp | 1 + llvm/lib/Target/AIE/AIESuperRegUtils.cpp | 25 ++- llvm/lib/Target/AIE/AIESuperRegUtils.h | 3 +- .../AIE/AIEUnallocatedSuperRegRewriter.cpp | 171 ++++++++++++++++++ llvm/lib/Target/AIE/CMakeLists.txt | 1 + .../Target/AIE/aie2p/AIE2PTargetMachine.cpp | 2 + .../CodeGen/AIE/aie2p/llc-pipeline-aie2p.ll | 2 + .../CodeGen/AIE/aie2p/ra/staged-ra-spill.mir | 81 ++++++--- .../aie2p/ra/staged-rewrite-unallocated.mir | 16 +- .../CodeGen/AIE/aie2p/ra/tie-subregs-flow.mir | 12 +- llvm/test/CodeGen/AIE/staged-ra-rewrite.mir | 11 +- 12 files changed, 269 insertions(+), 59 deletions(-) create mode 100644 llvm/lib/Target/AIE/AIEUnallocatedSuperRegRewriter.cpp diff --git a/llvm/lib/Target/AIE/AIE.h b/llvm/lib/Target/AIE/AIE.h index 622acd89b7ae..39138cf1b82a 100644 --- a/llvm/lib/Target/AIE/AIE.h +++ b/llvm/lib/Target/AIE/AIE.h @@ -60,6 +60,7 @@ MachineFunctionPass *createAIEEliminateDuplicatePHI(); FunctionPass *createAIEOutlineMemoryGEP(); FunctionPass 
*createAIESuperRegRewriter(); FunctionPass *createAIEWawRegRewriter(); +FunctionPass *createAIEUnallocatedSuperRegRewriter(); FunctionPass *createAIEPostSelectOptimize(); MachineFunctionPass * createDeadMachineInstructionElim(bool KeepLifetimeInstructions); @@ -84,6 +85,8 @@ extern char &AIESuperRegRewriterID; void initializeAIESuperRegRewriterPass(PassRegistry &); extern char &AIEWawRegRewriterID; void initializeAIEWawRegRewriterPass(PassRegistry &); +extern char &AIEUnallocatedSuperRegRewriterID; +void initializeAIEUnallocatedSuperRegRewriterPass(PassRegistry &); extern char &AIEOutlineMemoryGEPID; void initializeAIEOutlineMemoryGEPPass(PassRegistry &); diff --git a/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp b/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp index ae472b53094e..0b572e3a14fc 100644 --- a/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp +++ b/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp @@ -158,6 +158,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAIETarget() { initializeAIEPseudoBranchExpansionPass(*PR); initializeAIESubRegConstrainerPass(*PR); initializeAIESuperRegRewriterPass(*PR); + initializeAIEUnallocatedSuperRegRewriterPass(*PR); initializeAIEWawRegRewriterPass(*PR); initializeAIEOutlineMemoryGEPPass(*PR); initializeAIEFinalizeBundlePass(*PR); diff --git a/llvm/lib/Target/AIE/AIESuperRegUtils.cpp b/llvm/lib/Target/AIE/AIESuperRegUtils.cpp index 7bf6011a7cf6..9ae12b80e329 100644 --- a/llvm/lib/Target/AIE/AIESuperRegUtils.cpp +++ b/llvm/lib/Target/AIE/AIESuperRegUtils.cpp @@ -184,21 +184,26 @@ LaneBitmask getLiveLanesAt(SlotIndex Index, Register Reg, return LiveLanes; } -void rewriteSuperReg(Register Reg, Register AssignedPhysReg, +void rewriteSuperReg(Register Reg, std::optional AssignedPhysReg, SmallSet &SubRegs, MachineRegisterInfo &MRI, const AIEBaseRegisterInfo &TRI, VirtRegMap &VRM, LiveRegMatrix &LRM, LiveIntervals &LIS, SlotIndexes &Indexes, LiveDebugVariables &DebugVars) { LLVM_DEBUG(dbgs() << "Rewriting " << printReg(Reg, 
&TRI, 0, &MRI) << '\n'); - auto *TII = static_cast( - VRM.getMachineFunction().getSubtarget().getInstrInfo()); + MachineFunction &MF = VRM.getMachineFunction(); + auto *TII = + static_cast(MF.getSubtarget().getInstrInfo()); // Collect all the subreg indices to rewrite as independent vregs. SmallMapVector SubRegToVReg; const TargetRegisterClass *SuperRC = MRI.getRegClass(Reg); assert(!SubRegs.empty()); for (int SubReg : SubRegs) { - const TargetRegisterClass *SubRC = TRI.getSubRegisterClass(SuperRC, SubReg); + const TargetRegisterClass *SubRC = + AssignedPhysReg.has_value() + ? TRI.getSubRegisterClass(SuperRC, SubReg) + : TRI.getLargestLegalSuperClass( + TRI.getSubRegisterClass(SuperRC, SubReg), MF); SubRegToVReg[SubReg] = MRI.createVirtualRegister(SubRC); } @@ -246,7 +251,6 @@ void rewriteSuperReg(Register Reg, Register AssignedPhysReg, LIS.removeInterval(Reg); for (auto &[SubRegIdx, VReg] : SubRegToVReg) { - MCRegister SubPhysReg = TRI.getSubReg(AssignedPhysReg, SubRegIdx); LiveInterval &SubRegLI = LIS.getInterval(VReg); LLVM_DEBUG(dbgs() << " Assigning Range: " << SubRegLI << '\n'); @@ -257,10 +261,13 @@ void rewriteSuperReg(Register Reg, Register AssignedPhysReg, LIComponents.push_back(&SubRegLI); VRM.grow(); - for (LiveInterval *LI : LIComponents) { - LRM.assign(*LI, SubPhysReg); - VRM.setRequiredPhys(LI->reg(), SubPhysReg); - LLVM_DEBUG(dbgs() << " Assigned " << printReg(LI->reg()) << "\n"); + if (AssignedPhysReg.has_value()) { + MCRegister SubPhysReg = TRI.getSubReg(*AssignedPhysReg, SubRegIdx); + for (LiveInterval *LI : LIComponents) { + LRM.assign(*LI, SubPhysReg); + VRM.setRequiredPhys(LI->reg(), SubPhysReg); + LLVM_DEBUG(dbgs() << " Assigned " << printReg(LI->reg()) << "\n"); + } } } diff --git a/llvm/lib/Target/AIE/AIESuperRegUtils.h b/llvm/lib/Target/AIE/AIESuperRegUtils.h index 54f698274f08..d2e8f2ab9b7e 100644 --- a/llvm/lib/Target/AIE/AIESuperRegUtils.h +++ b/llvm/lib/Target/AIE/AIESuperRegUtils.h @@ -15,6 +15,7 @@ #define 
LLVM_LIB_TARGET_AIE_AIESUPERREGUTILS_H #include "llvm/ADT/SmallSet.h" +#include namespace llvm { class Register; @@ -63,7 +64,7 @@ void rewriteFullCopy(MachineInstr &MI, const std::set &CopySubRegs, LaneBitmask getLiveLanesAt(SlotIndex Index, Register Reg, const LiveIntervals &LIS); -void rewriteSuperReg(Register Reg, Register AssignedPhysReg, +void rewriteSuperReg(Register Reg, std::optional AssignedPhysReg, SmallSet &SubRegs, MachineRegisterInfo &MRI, const AIEBaseRegisterInfo &TRI, VirtRegMap &VRM, LiveRegMatrix &LRM, LiveIntervals &LIS, diff --git a/llvm/lib/Target/AIE/AIEUnallocatedSuperRegRewriter.cpp b/llvm/lib/Target/AIE/AIEUnallocatedSuperRegRewriter.cpp new file mode 100644 index 000000000000..e70126327f37 --- /dev/null +++ b/llvm/lib/Target/AIE/AIEUnallocatedSuperRegRewriter.cpp @@ -0,0 +1,171 @@ +//===-- AIEUnallocatedSuperRegRewriter.cpp - Split unallocated super-regs -===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025 Advanced Micro Devices, Inc.
or its affiliates +// +//===----------------------------------------------------------------------===// + +#include "AIEBaseInstrInfo.h" +#include "AIEBaseRegisterInfo.h" +#include "AIESuperRegUtils.h" + +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/CodeGen/LiveDebugVariables.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/LiveRegMatrix.h" +#include "llvm/CodeGen/LiveStacks.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "aie-ra-prepare" + +namespace { + +using RegRewriteInfo = std::vector>>; + +/// Split large unallocated compound registers into multiple new smaller vregs +/// That can be allocated to scalar registers. +class AIEUnallocatedSuperRegRewriter : public MachineFunctionPass { + +public: + static char ID; + AIEUnallocatedSuperRegRewriter() : MachineFunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &Fn) override; +}; + +/// Identify unallocated virtual registers that can be split into subregisters.
+/// Returns a list of candidate registers with their rewritable subregister +/// indices, excluding unused registers and those already assigned to physical +/// registers. +static RegRewriteInfo getRewriteCandidates(MachineRegisterInfo &MRI, + const AIEBaseRegisterInfo &TRI, + VirtRegMap &VRM) { + RegRewriteInfo RegistersToRewrite; + for (unsigned VRegIdx = 0, End = MRI.getNumVirtRegs(); VRegIdx != End; + ++VRegIdx) { + const Register Reg = Register::index2VirtReg(VRegIdx); + + // Skip unused registers and those already mapped to a physical register + if (MRI.reg_nodbg_empty(Reg) || VRM.hasPhys(Reg)) + continue; + + SmallSet RewritableSubRegs = + AIESuperRegUtils::getRewritableSubRegs(Reg, MRI, TRI); + + if (RewritableSubRegs.empty()) + continue; + + LLVM_DEBUG(dbgs() << "Candidate " << printReg(Reg, &TRI, 0, &MRI) << ":" + << printRegClassOrBank(Reg, MRI, &TRI) << '\n'); + + RegistersToRewrite.push_back({Reg, RewritableSubRegs}); + } + + LLVM_DEBUG(dbgs() << "Found " << RegistersToRewrite.size() + << " candidate register(s) for rewriting\n"); + + return RegistersToRewrite; +} + +/// Split candidate registers into independent virtual registers for each +/// subregister. Each composite register is rewritten using its subregister +/// indices, with live intervals and debug information updated accordingly.
+void rewriteCandidates(RegRewriteInfo &RegistersToRewrite, + MachineRegisterInfo &MRI, const AIEBaseRegisterInfo &TRI, + VirtRegMap &VRM, LiveRegMatrix &LRM, LiveIntervals &LIS, + SlotIndexes &Indexes, LiveDebugVariables &DebugVars) { + + LLVM_DEBUG(dbgs() << "Rewriting " << RegistersToRewrite.size() + << " candidate register(s)\n"); + + for (auto [VReg, SubRegs] : RegistersToRewrite) { + LLVM_DEBUG(dbgs() << " Rewriting " << printReg(VReg, &TRI, 0, &MRI) + << " into " << SubRegs.size() << " subregister(s)\n"); + AIESuperRegUtils::rewriteSuperReg( + VReg, /*std::optional AssignedPhysReg = */ {}, SubRegs, MRI, + TRI, VRM, LRM, LIS, Indexes, DebugVars); + } +} + +bool AIEUnallocatedSuperRegRewriter::runOnMachineFunction(MachineFunction &MF) { + LLVM_DEBUG(llvm::dbgs() << "*** Splitting unallocated super-registers: " + << MF.getName() << " ***\n"); + + MachineRegisterInfo &MRI = MF.getRegInfo(); + VirtRegMap &VRM = getAnalysis().getVRM(); + LiveRegMatrix &LRM = getAnalysis().getLRM(); + LiveIntervals &LIS = getAnalysis().getLIS(); + SlotIndexes &Indexes = getAnalysis().getSI(); + LiveDebugVariables &DebugVars = + getAnalysis().getLDV(); + auto &TRI = + *static_cast(MRI.getTargetRegisterInfo()); + + LLVM_DEBUG(dbgs() << "Identifying rewrite candidates...\n"); + RegRewriteInfo RegistersToRewrite = getRewriteCandidates(MRI, TRI, VRM); + + if (RegistersToRewrite.empty()) { + LLVM_DEBUG(dbgs() << "No candidates found, skipping rewrite\n"); + return false; + } + + LLVM_DEBUG(dbgs() << "Performing register rewrites...\n"); + rewriteCandidates(RegistersToRewrite, MRI, TRI, VRM, LRM, LIS, Indexes, + DebugVars); + + LLVM_DEBUG(dbgs() << "Successfully rewrote " << RegistersToRewrite.size() + << " register(s)\n"); + + return !RegistersToRewrite.empty(); +} + +} // end anonymous namespace + +char AIEUnallocatedSuperRegRewriter::ID = 0; +char &llvm::AIEUnallocatedSuperRegRewriterID = + AIEUnallocatedSuperRegRewriter::ID; + +INITIALIZE_PASS(AIEUnallocatedSuperRegRewriter, + 
"aie-unallocated-superreg-rewrite", + "AIE unallocated super-reg rewrite", false, false) + +llvm::FunctionPass *llvm::createAIEUnallocatedSuperRegRewriter() { + return new AIEUnallocatedSuperRegRewriter(); +} diff --git a/llvm/lib/Target/AIE/CMakeLists.txt b/llvm/lib/Target/AIE/CMakeLists.txt index 4bbc0011146f..5e5f42d9e4ab 100644 --- a/llvm/lib/Target/AIE/CMakeLists.txt +++ b/llvm/lib/Target/AIE/CMakeLists.txt @@ -141,6 +141,7 @@ add_llvm_target(AIECodeGen AIE2TargetMachine.cpp AIE2TargetTransformInfo.cpp AIETiedRegOperands.cpp + AIEUnallocatedSuperRegRewriter.cpp ReservedRegsLICM.cpp AIEOutlineMemoryGEP.cpp AIEWawRegRewriter.cpp diff --git a/llvm/lib/Target/AIE/aie2p/AIE2PTargetMachine.cpp b/llvm/lib/Target/AIE/aie2p/AIE2PTargetMachine.cpp index 92c72d01135d..77bc2db140e5 100644 --- a/llvm/lib/Target/AIE/aie2p/AIE2PTargetMachine.cpp +++ b/llvm/lib/Target/AIE/aie2p/AIE2PTargetMachine.cpp @@ -114,6 +114,8 @@ bool AIE2PPassConfig::addRegAssignAndRewriteOptimized() { addPass(createAIESuperRegRewriter()); addPass(createGreedyRegisterAllocator(onlyAllocate3D2DRegisters)); addPass(createAIESuperRegRewriter()); + if (EnableFineGrainedStagedRA) + addPass(createAIEUnallocatedSuperRegRewriter()); } addPass(createGreedyRegisterAllocator()); if (EnableWAWRegRewrite) { diff --git a/llvm/test/CodeGen/AIE/aie2p/llc-pipeline-aie2p.ll b/llvm/test/CodeGen/AIE/aie2p/llc-pipeline-aie2p.ll index f24f03290521..f612a96d2362 100644 --- a/llvm/test/CodeGen/AIE/aie2p/llc-pipeline-aie2p.ll +++ b/llvm/test/CodeGen/AIE/aie2p/llc-pipeline-aie2p.ll @@ -247,6 +247,7 @@ ; AIE-O1-NEXT: AIE super-reg rewrite ; AIE-O1-NEXT: Greedy Register Allocator ; AIE-O1-NEXT: AIE super-reg rewrite +; AIE-O1-NEXT: AIE unallocated super-reg rewrite ; AIE-O1-NEXT: Greedy Register Allocator ; AIE-O1-NEXT: AIE waw-reg rewrite ; AIE-O1-NEXT: Greedy Register Allocator @@ -472,6 +473,7 @@ ; AIE-O23-NEXT: AIE super-reg rewrite ; AIE-O23-NEXT: Greedy Register Allocator ; AIE-O23-NEXT: AIE super-reg rewrite +; 
AIE-O23-NEXT: AIE unallocated super-reg rewrite ; AIE-O23-NEXT: Greedy Register Allocator ; AIE-O23-NEXT: AIE waw-reg rewrite ; AIE-O23-NEXT: Greedy Register Allocator diff --git a/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-spill.mir b/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-spill.mir index b40dbeee2ddd..5339cba35601 100644 --- a/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-spill.mir +++ b/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-spill.mir @@ -6,42 +6,68 @@ # # (c) Copyright 2023-2025 Advanced Micro Devices, Inc. or its affiliates +# RUN: llc -O2 -mtriple=aie2p -verify-machineinstrs --aie-staged-ra -start-before=greedy -aie-staged-ra-fine-grained-alloc=false \ +# RUN: -stop-after=virtregrewriter %s -o - | FileCheck %s --check-prefix=RA-STAGED # RUN: llc -O2 -mtriple=aie2p -verify-machineinstrs --aie-staged-ra -start-before=greedy -stop-after=virtregrewriter %s -o - \ -# RUN: | FileCheck %s --check-prefix=RA +# RUN: | FileCheck %s --check-prefix=RA-STAGEG-FG -# Test what happens the 2D allocation stage needs to spill, and then the +# Test what happens when the 2D allocation stage needs to spill, and then the # last allocation stage needs to spill again to make space for allocating -# %7:edj = MOV_PD_imm10_pseudo 12. +# %7:edj = MOV_PD_imm10_pseudo 12. Please note that in RA-STAGEG-FG +# (FG = fine grained) we can avoid spills by using scalar registers.
--- name: test_spill_2d_last_stage tracksRegLiveness: true body: | bb.1.entry: liveins: $p0, $p1, $d1, $d2, $d3, $d4, $d5, $d6, $d7 - ; RA-LABEL: name: test_spill_2d_last_stage - ; RA: liveins: $d1, $d2, $d3, $d4, $d5, $d6, $d7, $p0, $p1 - ; RA-NEXT: {{ $}} - ; RA-NEXT: renamable $dn0 = LDA_dms_lda_idx_imm renamable $p1, 0 - ; RA-NEXT: renamable $m0 = LDA_dms_lda_idx_imm renamable $p1, 4 - ; RA-NEXT: renamable $dj0 = LDA_dms_lda_idx_imm renamable $p1, 8 - ; RA-NEXT: ST_D_SPILL renamable $d0, %stack.1, implicit $sp :: (store (s128) into %stack.1, align 4) - ; RA-NEXT: renamable $dj0 = MOV_PD_imm11_pseudo 12 - ; RA-NEXT: renamable $r0 = LDA_dms_lda_idx renamable $p1, killed renamable $dj0 - ; RA-NEXT: renamable $d0 = LDA_D_SPILL %stack.1, implicit $sp :: (load (s128) from %stack.1, align 4) - ; RA-NEXT: renamable $dc0 = COPY killed renamable $r0 - ; RA-NEXT: ST_D_SPILL killed renamable $d0, %stack.1, implicit $sp :: (store (s128) into %stack.1, align 4) - ; RA-NEXT: renamable $dn0 = LDA_dms_lda_idx_imm renamable $p1, 16 - ; RA-NEXT: renamable $m0 = LDA_dms_lda_idx_imm renamable $p1, 20 - ; RA-NEXT: renamable $dj0 = LDA_dms_lda_idx_imm renamable $p1, 24 - ; RA-NEXT: renamable $dc0 = LDA_dms_lda_idx_imm killed renamable $p1, 28 - ; RA-NEXT: ST_D_SPILL killed renamable $d0, %stack.0, implicit $sp :: (store (s128) into %stack.0, align 4) - ; RA-NEXT: renamable $d0 = LDA_D_SPILL %stack.1, implicit $sp :: (load (s128) from %stack.1, align 4) - ; RA-NEXT: $p0, $dc0 = PADDA_2D_split killed $p0, killed $m0, killed $dn0, killed $dj0, killed $dc0 - ; RA-NEXT: ST_D_SPILL renamable $d0, %stack.1, implicit $sp :: (store (s128) into %stack.1, align 4) - ; RA-NEXT: renamable $d0 = LDA_D_SPILL %stack.0, implicit $sp :: (load (s128) from %stack.0, align 4) - ; RA-NEXT: $p0, dead $dc0 = PADDA_2D_split killed $p0, killed $m0, killed $dn0, killed $dj0, killed $dc0 - ; RA-NEXT: renamable $d0 = LDA_D_SPILL %stack.1, implicit $sp :: (load (s128) from %stack.1, align 4) - ; RA-NEXT: 
PseudoRET implicit $lr, implicit killed renamable $p0, implicit killed renamable $dc0, implicit $d1, implicit $d2, implicit $d3, implicit $d4, implicit $d5, implicit $d6, implicit $d7 + ; RA-STAGED-LABEL: name: test_spill_2d_last_stage + ; RA-STAGED: liveins: $d1, $d2, $d3, $d4, $d5, $d6, $d7, $p0, $p1 + ; RA-STAGED-NEXT: {{ $}} + ; RA-STAGED-NEXT: renamable $dn0 = LDA_dms_lda_idx_imm renamable $p1, 0 + ; RA-STAGED-NEXT: renamable $m0 = LDA_dms_lda_idx_imm renamable $p1, 4 + ; RA-STAGED-NEXT: renamable $dj0 = LDA_dms_lda_idx_imm renamable $p1, 8 + ; RA-STAGED-NEXT: ST_D_SPILL renamable $d0, %stack.0, implicit $sp :: (store (s128) into %stack.0, align 4) + ; RA-STAGED-NEXT: renamable $dj0 = MOV_PD_imm11_pseudo 12 + ; RA-STAGED-NEXT: renamable $r0 = LDA_dms_lda_idx renamable $p1, killed renamable $dj0 + ; RA-STAGED-NEXT: renamable $d0 = LDA_D_SPILL %stack.0, implicit $sp :: (load (s128) from %stack.0, align 4) + ; RA-STAGED-NEXT: renamable $dc0 = COPY killed renamable $r0 + ; RA-STAGED-NEXT: ST_D_SPILL killed renamable $d0, %stack.0, implicit $sp :: (store (s128) into %stack.0, align 4) + ; RA-STAGED-NEXT: renamable $dn0 = LDA_dms_lda_idx_imm renamable $p1, 16 + ; RA-STAGED-NEXT: renamable $m0 = LDA_dms_lda_idx_imm renamable $p1, 20 + ; RA-STAGED-NEXT: renamable $dj0 = LDA_dms_lda_idx_imm renamable $p1, 24 + ; RA-STAGED-NEXT: renamable $dc0 = LDA_dms_lda_idx_imm killed renamable $p1, 28 + ; RA-STAGED-NEXT: ST_D_SPILL killed renamable $d0, %stack.1, implicit $sp :: (store (s128) into %stack.1, align 4) + ; RA-STAGED-NEXT: renamable $d0 = LDA_D_SPILL %stack.0, implicit $sp :: (load (s128) from %stack.0, align 4) + ; RA-STAGED-NEXT: $p0, $dc0 = PADDA_2D_split killed $p0, $m0, $dn0, $dj0, $dc0 + ; RA-STAGED-NEXT: ST_D_SPILL killed renamable $d0, %stack.0, implicit $sp :: (store (s128) into %stack.0, align 4) + ; RA-STAGED-NEXT: renamable $d0 = LDA_D_SPILL %stack.1, implicit $sp :: (load (s128) from %stack.1, align 4) + ; RA-STAGED-NEXT: $p0, dead $dc0 = PADDA_2D_split 
killed $p0, $m0, $dn0, $dj0, $dc0 + ; RA-STAGED-NEXT: renamable $d0 = LDA_D_SPILL %stack.0, implicit $sp :: (load (s128) from %stack.0, align 4) + ; RA-STAGED-NEXT: PseudoRET implicit $lr, implicit killed renamable $p0, implicit killed renamable $dc0, implicit $d1, implicit $d2, implicit $d3, implicit $d4, implicit $d5, implicit $d6, implicit $d7 + ; + ; RA-STAGEG-FG-LABEL: name: test_spill_2d_last_stage + ; RA-STAGEG-FG: liveins: $d1, $d2, $d3, $d4, $d5, $d6, $d7, $p0, $p1 + ; RA-STAGEG-FG-NEXT: {{ $}} + ; RA-STAGEG-FG-NEXT: renamable $dn0 = LDA_dms_lda_idx_imm renamable $p1, 0 + ; RA-STAGEG-FG-NEXT: renamable $m0 = LDA_dms_lda_idx_imm renamable $p1, 4 + ; RA-STAGEG-FG-NEXT: renamable $r0 = LDA_dms_lda_idx_imm renamable $p1, 8 + ; RA-STAGEG-FG-NEXT: renamable $dj0 = MOV_PD_imm11_pseudo 12 + ; RA-STAGEG-FG-NEXT: renamable $r1 = LDA_dms_lda_idx renamable $p1, killed renamable $dj0 + ; RA-STAGEG-FG-NEXT: renamable $dc0 = COPY killed renamable $r1 + ; RA-STAGEG-FG-NEXT: renamable $dj0 = COPY killed renamable $r0 + ; RA-STAGEG-FG-NEXT: renamable $r1 = LDA_dms_lda_idx_imm renamable $p1, 16 + ; RA-STAGEG-FG-NEXT: renamable $r3 = LDA_dms_lda_idx_imm renamable $p1, 20 + ; RA-STAGEG-FG-NEXT: renamable $r2 = LDA_dms_lda_idx_imm renamable $p1, 24 + ; RA-STAGEG-FG-NEXT: renamable $r0 = LDA_dms_lda_idx_imm killed renamable $p1, 28 + ; RA-STAGEG-FG-NEXT: $p0, $dc0 = PADDA_2D_split killed $p0, killed $m0, killed $dn0, killed $dj0, killed $dc0 + ; RA-STAGEG-FG-NEXT: renamable $r4 = COPY killed renamable $dc0 + ; RA-STAGEG-FG-NEXT: renamable $dc0 = COPY killed renamable $r0 + ; RA-STAGEG-FG-NEXT: renamable $dn0 = COPY killed renamable $r1 + ; RA-STAGEG-FG-NEXT: renamable $dj0 = COPY killed renamable $r2 + ; RA-STAGEG-FG-NEXT: renamable $m0 = COPY killed renamable $r3 + ; RA-STAGEG-FG-NEXT: $p0, dead $dc0 = PADDA_2D_split killed $p0, killed $m0, killed $dn0, killed $dj0, killed $dc0 + ; RA-STAGEG-FG-NEXT: PseudoRET implicit $lr, implicit killed renamable $p0, implicit killed 
renamable $r4, implicit $d1, implicit $d2, implicit $d3, implicit $d4, implicit $d5, implicit $d6, implicit $d7 %20:ep = COPY $p0 %21:ep = COPY $p1 undef %100.sub_dim_size:ed = LDA_dms_lda_idx_imm %21, 0 @@ -58,3 +84,4 @@ body: | %20:ep, %101.sub_dim_count:ed = PADDA_2D_split %20, %101.sub_mod, %101.sub_dim_size, %101.sub_dim_stride, %101.sub_dim_count PseudoRET implicit $lr, implicit %20, implicit %100.sub_dim_count, implicit $d1, implicit $d2, implicit $d3, implicit $d4, implicit $d5, implicit $d6, implicit $d7 ... + diff --git a/llvm/test/CodeGen/AIE/aie2p/ra/staged-rewrite-unallocated.mir b/llvm/test/CodeGen/AIE/aie2p/ra/staged-rewrite-unallocated.mir index 40d2ad43e090..f536c520d918 100644 --- a/llvm/test/CodeGen/AIE/aie2p/ra/staged-rewrite-unallocated.mir +++ b/llvm/test/CodeGen/AIE/aie2p/ra/staged-rewrite-unallocated.mir @@ -24,13 +24,13 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[MOV_RLC_imm11_pseudo:%[0-9]+]]:erf2 = MOV_RLC_imm11_pseudo 0 ; CHECK-NEXT: undef [[VBCST_32_:%[0-9]+]].sub_512_lo:vec1024 = VBCST_32 [[MOV_RLC_imm11_pseudo]] - ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em_as_32bit = MOV_PD_imm11_pseudo 0 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:edjl = COPY [[MOV_PD_imm11_pseudo]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:ednl = COPY [[MOV_PD_imm11_pseudo]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:edcl = COPY [[MOV_PD_imm11_pseudo]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:ednh = COPY [[MOV_PD_imm11_pseudo]] - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:edch = COPY [[MOV_PD_imm11_pseudo]] - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:edjh = COPY [[COPY]] + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:magusrc_and_magudst_and_spill_em_to_er = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:spill_edj_to_er = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:spill_edn_to_er = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:spill_edc_to_er = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:spill_edn_to_er = COPY [[MOV_PD_imm11_pseudo]] + ; 
CHECK-NEXT: [[COPY4:%[0-9]+]]:spill_edc_to_er = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:spill_edj_to_er = COPY [[COPY]] ; CHECK-NEXT: [[VBCST_32_:%[0-9]+]].sub_512_hi:vec1024 = COPY [[VBCST_32_]].sub_512_lo ; CHECK-NEXT: [[COPY6:%[0-9]+]]:eldfiforeg = COPY [[VBCST_32_]] ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:eps = MOV_PD_imm11_pseudo 0 @@ -48,7 +48,7 @@ body: | ; CHECK-NEXT: undef [[COPY14:%[0-9]+]].sub_ptr:epsrfldf = COPY [[MOV_PD_imm11_pseudo1]] ; CHECK-NEXT: [[COPY14:%[0-9]+]].sub_fifo:epsrfldf = COPY [[COPY6]] ; CHECK-NEXT: [[COPY14:%[0-9]+]].sub_avail:epsrfldf = COPY [[MOV_RLC_imm11_pseudo]] - ; CHECK-NEXT: dead [[VLD_POP_576_3D_pseudo_split:%[0-9]+]]:vec576, dead [[COPY14:%[0-9]+]].sub_ptr:epsrfldf, dead [[COPY14:%[0-9]+]].sub_fifo:epsrfldf, dead [[COPY14:%[0-9]+]].sub_avail:epsrfldf, dead [[COPY7:%[0-9]+]]:edcl, dead [[COPY11:%[0-9]+]]:edch = VLD_POP_576_3D_pseudo_split [[COPY14]].sub_ptr, [[COPY14]].sub_fifo, [[COPY14]].sub_avail, [[COPY10]], [[COPY8]], [[COPY9]], [[COPY7]], undef %23:em_as_32bit, [[COPY12]], [[COPY13]], [[COPY11]], implicit-def $srfifo_uf :: (load unknown-size from `ptr addrspace(5) null`, align 1, addrspace 5) + ; CHECK-NEXT: dead [[VLD_POP_576_3D_pseudo_split:%[0-9]+]]:vec576, dead [[COPY14:%[0-9]+]].sub_ptr:epsrfldf, dead [[COPY14:%[0-9]+]].sub_fifo:epsrfldf, dead [[COPY14:%[0-9]+]].sub_avail:epsrfldf, dead [[COPY7:%[0-9]+]]:edcl, dead [[COPY11:%[0-9]+]]:edch = VLD_POP_576_3D_pseudo_split [[COPY14]].sub_ptr, [[COPY14]].sub_fifo, [[COPY14]].sub_avail, [[COPY10]], [[COPY8]], [[COPY9]], [[COPY7]], undef %15:em_as_32bit, [[COPY12]], [[COPY13]], [[COPY11]], implicit-def $srfifo_uf :: (load unknown-size from `ptr addrspace(5) null`, align 1, addrspace 5) ; CHECK-NEXT: PseudoJ_jump_imm %bb.1 bb.0: successors: %bb.1(0x80000000) diff --git a/llvm/test/CodeGen/AIE/aie2p/ra/tie-subregs-flow.mir b/llvm/test/CodeGen/AIE/aie2p/ra/tie-subregs-flow.mir index a309aeb94dc9..ba8bbb824b51 100644 --- 
a/llvm/test/CodeGen/AIE/aie2p/ra/tie-subregs-flow.mir +++ b/llvm/test/CodeGen/AIE/aie2p/ra/tie-subregs-flow.mir @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +# (c) Copyright 2024-2025 Advanced Micro Devices, Inc. or its affiliates # RUN: llc -O2 -mtriple=aie2p --issue-limit=1 --aie-bottomup-cycles=0 -verify-machineinstrs \ # RUN: -start-before=phi-node-elimination -stop-before=aie-finalize-mi-bundles \ # RUN: %s -o - | FileCheck %s @@ -181,25 +181,21 @@ body: | ; CHECK-LABEL: name: test_4_padd_scarce ; CHECK: liveins: $d2, $d3, $d4, $d5, $d6, $d7, $m0, $p0, $p1, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: frame-setup PADDXM_pstm_sp_imm 64, implicit-def $sp, implicit $sp - ; CHECK-NEXT: $m1 = MOV_alu_mv_mv_mv_scl killed $r4 - ; CHECK-NEXT: ST_dms_sts_spill killed $m1, -64, implicit $sp :: (store (s32) into %stack.0) ; CHECK-NEXT: $dc1 = MOV_alu_mv_mv_mv_scl $r3 ; CHECK-NEXT: $dn1 = MOV_alu_mv_mv_mv_scl $r1 ; CHECK-NEXT: $dj1 = MOV_alu_mv_mv_mv_scl $r2 ; CHECK-NEXT: $m1 = MOV_alu_mv_mv_mv_scl $r0 - ; CHECK-NEXT: $p0, dead $dc1 = PADDA_2D killed $p0, killed $d1 - ; CHECK-NEXT: $m1 = LDA_dms_lda_spill -64, implicit $sp :: (load (s32) from %stack.0) ; CHECK-NEXT: $m7 = MOV_alu_mv_mv_mv_scl killed $r0 ; CHECK-NEXT: $dn7 = MOV_alu_mv_mv_mv_scl killed $r1 ; CHECK-NEXT: $dj7 = MOV_alu_mv_mv_mv_scl killed $r2 ; CHECK-NEXT: $dc7 = MOV_alu_mv_mv_mv_scl killed $r3 + ; CHECK-NEXT: $p0, dead $dc1 = PADDA_2D killed $p0, killed $d1 ; CHECK-NEXT: $p2 = MOV_alu_mv_mv_mv_scl $p1 + ; CHECK-NEXT: $m1 = MOV_alu_mv_mv_mv_scl killed $r4 ; CHECK-NEXT: $dn1 = MOV_alu_mv_mv_mv_scl killed $r5 - ; CHECK-NEXT: dead $p2, $dc7 = PADDA_2D killed $p2, $d7 ; CHECK-NEXT: $dj1 = MOV_alu_mv_mv_mv_scl killed $r6 ; CHECK-NEXT: RET implicit $lr - ; CHECK-NEXT: frame-destroy PADDXM_pstm_sp_imm -64, implicit-def $sp, 
implicit $sp + ; CHECK-NEXT: dead $p2, $dc7 = PADDA_2D killed $p2, $d7 ; CHECK-NEXT: $dc1 = MOV_alu_mv_mv_mv_scl killed $r7 ; CHECK-NEXT: $p0, dead $dc7 = PADDA_2D killed $p0, killed $d7 ; CHECK-NEXT: $p1, dead $dc1 = PADDA_2D killed $p1, killed $d1 diff --git a/llvm/test/CodeGen/AIE/staged-ra-rewrite.mir b/llvm/test/CodeGen/AIE/staged-ra-rewrite.mir index 9f8898f794f9..c8d210a3b55f 100644 --- a/llvm/test/CodeGen/AIE/staged-ra-rewrite.mir +++ b/llvm/test/CodeGen/AIE/staged-ra-rewrite.mir @@ -442,14 +442,13 @@ body: | ; AIE2P-RA-NEXT: successors: %bb.1(0x80000000) ; AIE2P-RA-NEXT: liveins: $dj1, $dn1, $m1, $p0 ; AIE2P-RA-NEXT: {{ $}} - ; AIE2P-RA-NEXT: renamable $dc1 = MOV_PD_imm11_pseudo 0 + ; AIE2P-RA-NEXT: renamable $dc0 = MOV_PD_imm11_pseudo 0 ; AIE2P-RA-NEXT: {{ $}} ; AIE2P-RA-NEXT: bb.1: - ; AIE2P-RA-NEXT: liveins: $d1:0x0000000000200E00, $p0 + ; AIE2P-RA-NEXT: liveins: $dc0, $dj1, $dn1, $m1, $p0 ; AIE2P-RA-NEXT: {{ $}} - ; AIE2P-RA-NEXT: renamable $dc0 = COPY renamable $dc1 - ; AIE2P-RA-NEXT: renamable $dn0 = COPY renamable $dn1 - ; AIE2P-RA-NEXT: renamable $dj0 = COPY renamable $dj1 + ; AIE2P-RA-NEXT: renamable $dn0 = COPY killed renamable $dn1 + ; AIE2P-RA-NEXT: renamable $dj0 = COPY killed renamable $dj1 ; AIE2P-RA-NEXT: renamable $m0 = COPY killed renamable $m1 ; AIE2P-RA-NEXT: $p0, $dc0 = PADDA_2D_split killed $p0, killed $m0, killed $dn0, killed $dj0, killed $dc0 ; AIE2P-RA-NEXT: PseudoRET implicit $lr, implicit killed renamable $p0, implicit killed renamable $dc0 @@ -628,7 +627,7 @@ body: | ; AIE2P-RA-NEXT: renamable $dc0 = LDA_dms_lda_idx_imm killed renamable $p1, 12 ; AIE2P-RA-NEXT: {{ $}} ; AIE2P-RA-NEXT: bb.1: - ; AIE2P-RA-NEXT: liveins: $d0:0x0000000000000E00, $p0 + ; AIE2P-RA-NEXT: liveins: $dc0, $dj0, $dn0, $p0 ; AIE2P-RA-NEXT: {{ $}} ; AIE2P-RA-NEXT: $p0, $dc0 = PADDA_2D_split killed $p0, undef $m0, killed $dn0, killed $dj0, killed $dc0 ; AIE2P-RA-NEXT: PseudoRET implicit $lr, implicit killed renamable $p0, implicit killed renamable $dc0 From 
fbaf15525e52a4512dda5b70450f0cf7f7cf455f Mon Sep 17 00:00:00 2001 From: Andreu Carminati Date: Tue, 14 Oct 2025 08:30:00 -0600 Subject: [PATCH 8/9] [AIE2P] Add base tests exposing cycles in copy bundles Co-Authored-By: Krishnam Tibrewala --- .../AIE/aie2p/ra/staged-ra-cycle-in-bundle.ll | 272 ++++++++++++++++++ .../ra/staged-rewrite-expand-copy-bundle.mir | 169 +++++++++++ 2 files changed, 441 insertions(+) create mode 100644 llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-cycle-in-bundle.ll create mode 100644 llvm/test/CodeGen/AIE/aie2p/ra/staged-rewrite-expand-copy-bundle.mir diff --git a/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-cycle-in-bundle.ll b/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-cycle-in-bundle.ll new file mode 100644 index 000000000000..22094118685d --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-cycle-in-bundle.ll @@ -0,0 +1,272 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 + +; +; This file is licensed under the Apache License v2.0 with LLVM Exceptions. +; See https://llvm.org/LICENSE.txt for license information. +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +; +; (c) Copyright 2025 Advanced Micro Devices, Inc. 
or its affiliates +; RUN: not llc -mtriple aie2p -o %t.s %s 2>&1 | FileCheck %s --check-prefix=BUNDLE-ERROR +; RUN: llc -mtriple=aie2p --aie-staged-ra-fine-grained-alloc=false %s -o - | FileCheck %s --check-prefix=COARSE-GRAINED + +; Function Attrs: nounwind readnone +; BUNDLE-ERROR: error: register rewriting failed: cycle in copy bundle +define void @heavy_3d_user(i32 %dimsAI.sroa.5.0.copyload.i, i32 %dimsAI.sroa.7.0.copyload.i, i32 %dimsAI.sroa.9.0.copyload.i, i32 %dimsAO.sroa.7.0.copyload.i, i32 %dimsAO.sroa.4.0.copyload.i, i32 %dimsAO.sroa.6.0.copyload.i, i32 %dimsAO.sroa.0.0.copyload.i, i32 %dimsAO.sroa.5.0.copyload.i, i32 %dimsW.sroa.4.0.copyload.i, i32 %dimsW.sroa.6.0.copyload.i, i20 %0, i1 %1, i32 %dimsAI.sroa.11.0.copyload.i) { +; COARSE-GRAINED-LABEL: heavy_3d_user: +; COARSE-GRAINED: // %bb.0: // %entry +; COARSE-GRAINED-NEXT: nopa ; nopb ; paddxm [sp], #384; nops +; COARSE-GRAINED-NEXT: mova m0, #-388; st r9, [sp, #-356]; mov p1, sp // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: mova m0, #-392; paddb [p1], m0; st r10, [sp, #-360] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda dj0, [p1, #0]; st r11, [sp, #-364]; mov p1, sp // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: mova m0, #-400; paddb [p1], m0; st r12, [sp, #-368] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda dj4, [p1, #0]; st r13, [sp, #-372]; mov p1, sp // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: padda [p1], m0; st r14, [sp, #-376] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda m0, [p1, #0]; st r15, [sp, #-380] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st p6, [sp, #-384] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: mova r16, #0; st lr, [sp, #-348] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st r8, [sp, #-352]; vbcst.32 x0, r16 // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st r0, [sp, #-248]; mov p6, p0 // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: vst x0, [sp, #-128]; mov p1, sp // 64-byte Folded Spill +; COARSE-GRAINED-NEXT: st dj0, [sp, #-304] // 4-byte Folded Spill 
+; COARSE-GRAINED-NEXT: mova m0, #-396; st m0, [sp, #-280] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: padda [p1], m0; st dj0, [sp, #-272]; vmov x1, x0 // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda r8, [p1, #0]; st dj0, [sp, #-336]; mov p3, #0 // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: vst x1, [sp, #-64]; jl p3 // 64-byte Folded Spill +; COARSE-GRAINED-NEXT: mova p2, #0; st dj4, [sp, #-288] // 4-byte Folded Spill Delay Slot 5 +; COARSE-GRAINED-NEXT: mova dj4, #1; st dj4, [sp, #-256]; mov r9, r1 // 4-byte Folded Spill Delay Slot 4 +; COARSE-GRAINED-NEXT: mova m0, #0; st dj4, [sp, #-320]; or r10, r2, r2; mov r11, r3 // 4-byte Folded Spill Delay Slot 3 +; COARSE-GRAINED-NEXT: mova p0, #0; st m0, [sp, #-344]; or r12, r4, r4; mov r13, r5 // 4-byte Folded Spill Delay Slot 2 +; COARSE-GRAINED-NEXT: mova p1, #0; or r14, r6, r6; mov r15, r7 // Delay Slot 1 +; COARSE-GRAINED-NEXT: lda m1, [sp, #-344]; nopb ; nopxm // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda dj5, [sp, #-320] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda m4, [sp, #-296]; mov dn4, r15 // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: st dn4, [sp, #-260]; mov dj0, r12 // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dj0, [sp, #-272]; mov dn0, r14 // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: mova dc3, #0; st dn0, [sp, #-276]; mov m0, r11 // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda m3, [sp, #-280]; movs dj4, r13; mov dc7, dc3 // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda m0, [sp, #-312]; st m0, [sp, #-280] // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda dj4, [sp, #-288]; st dj4, [sp, #-256] // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda m5, [sp, #-328]; movs dj6, dj5; mov m2, m1 // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda dn0, [sp, #-308]; movs dn3, m1; mov m1, dj5 // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda dj0, [sp, #-304]; st m4, [sp, #-296] // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: 
lda dn4, [sp, #-292]; st m4, [sp, #-328] // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: movs dc0, m2; mov dc6, m2 +; COARSE-GRAINED-NEXT: st m0, [sp, #-312] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dj4, [sp, #-288] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: movs m0, m2; mov dc4, m2 +; COARSE-GRAINED-NEXT: st dn0, [sp, #-308] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dj0, [sp, #-304] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda dj3, [sp, #-248]; st dn4, [sp, #-292] // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: st m2, [sp, #-248] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dj6, [sp, #-224] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dn0, [sp, #-340] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dj0, [sp, #-336] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dn4, [sp, #-324] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dc4, [sp, #-252] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: vlda x2, [sp, #-128]; movs dj4, dj5; mov dc4, dj5 // 64-byte Folded Reload +; COARSE-GRAINED-NEXT: vlda x3, [sp, #-64]; st dc0, [sp, #-268] // 64-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dc0, [sp, #-300] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dc6, [sp, #-220] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st m0, [sp, #-344] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dc0, [sp, #-332]; mov dn7, r9 // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dj4, [sp, #-320]; mov dj7, r10 // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dc4, [sp, #-284]; vmov lfl0, x2 // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda m7, [sp, #-264]; st dc4, [sp, #-316]; movx r0, #1; vmov lfh0, x3 // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: mova r3, #0; movs dc5, m2; and r1, r8, r0; mov dc1, m2 +; COARSE-GRAINED-NEXT: .LBB0_1: // %for.body.i +; COARSE-GRAINED-NEXT: // =>This Loop Header: Depth=1 +; COARSE-GRAINED-NEXT: // Child Loop BB0_2 Depth 2 +; 
COARSE-GRAINED-NEXT: lda m0, [sp, #-344]; nopb ; nopx // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda dc0, [sp, #-332] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda dj4, [sp, #-320] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: nop +; COARSE-GRAINED-NEXT: lda dn1, [sp, #-244]; movs dj1, p6; mov dn1, dn3 // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: movs dn5, dn3; mov m2, m1 +; COARSE-GRAINED-NEXT: lda dn5, [sp, #-228]; movs dj5, p6; mov dc6, dc5 // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: mova p1, #0; st m2, [sp, #-216]; mov r25, r3 // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: vldb.pop.576.3d ex0, [p1, lf1, r25, d1]; st dc6, [sp, #-188] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: movs dc1, dc0; mov dj1, m0 +; COARSE-GRAINED-NEXT: movs m1, m0; mov dj5, dj4 +; COARSE-GRAINED-NEXT: st dn1, [sp, #-340]; vmov lfl1, lfl0 // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda m5, [sp, #-232]; st dc1, [sp, #-332]; vmov lfh1, lfh0 // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda dc5, [sp, #-220]; movs dn1, dn3; mov dc1, dc3 // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: st dn5, [sp, #-324] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dj5, [sp, #-320] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: movs dn5, dn3; mov dj5, m0 +; COARSE-GRAINED-NEXT: st m1, [sp, #-344] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dj1, [sp, #-336] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st m5, [sp, #-328] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dc5, [sp, #-316] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st m1, [sp, #-248] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dj1, [sp, #-240] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st m5, [sp, #-232] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dn1, [sp, #-244] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: mova p0, #0; st dn5, [sp, #-228] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: paddb.3d [p0], d1; st dj5, [sp, #-224] // 4-byte Folded Spill +; 
COARSE-GRAINED-NEXT: st dc1, [sp, #-236] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: mova p0, #0; st dc5, [sp, #-220] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: .LBB0_2: // %for.body125.i +; COARSE-GRAINED-NEXT: // Parent Loop BB0_1 Depth=1 +; COARSE-GRAINED-NEXT: // => This Inner Loop Header: Depth=2 +; COARSE-GRAINED-NEXT: nops ; mov dn1, dn3 +; COARSE-GRAINED-NEXT: movs m1, m3; mov dj1, dj3 +; COARSE-GRAINED-NEXT: movs dc1, dc3; mov dn5, dn7 +; COARSE-GRAINED-NEXT: movs m5, m7; mov dc5, dc7 +; COARSE-GRAINED-NEXT: movs dj5, dj7; mov r25, r3 +; COARSE-GRAINED-NEXT: movs p1, p0; vmov lfl1, x2 +; COARSE-GRAINED-NEXT: .L_LEnd0: +; COARSE-GRAINED-NEXT: nopa ; vldb.pop.576.3d ex0, [p1, lf1, r25, d1]; nops ; nopx ; vmov lfh1, x3; nopv +; COARSE-GRAINED-NEXT: // %bb.3: // %for.cond.cleanup124.i +; COARSE-GRAINED-NEXT: // in Loop: Header=BB0_1 Depth=1 +; COARSE-GRAINED-NEXT: lda m2, [sp, #-344]; nopb ; nopx // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda dn2, [sp, #-276] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: nop +; COARSE-GRAINED-NEXT: nop +; COARSE-GRAINED-NEXT: lda dj2, [sp, #-272] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda m6, [sp, #-264] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda dn6, [sp, #-260] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda dj6, [sp, #-256] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda dj0, [sp, #-304]; mov dn0, m2 // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda m4, [sp, #-296]; movs m0, m2; mov dn4, m2 // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda dj4, [sp, #-288]; st dn2, [sp, #-276] // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dj2, [sp, #-272] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda dc0, [sp, #-300]; st m6, [sp, #-264] // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda dc4, [sp, #-284]; st dn6, [sp, #-260] // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda dc2, [sp, #-268]; st dj6, [sp, #-256] // 4-byte Folded 
Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda dc6, [sp, #-252]; st dj0, [sp, #-304] // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda m2, [sp, #-280]; st m4, [sp, #-296] // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dj4, [sp, #-288] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st m0, [sp, #-312] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda dj0, [sp, #-304]; st dn0, [sp, #-308]; mov p1, #0 // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda m4, [sp, #-296]; paddb.3d [p1], d0; st dn4, [sp, #-292] // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda dn0, [sp, #-308]; st dc0, [sp, #-300] // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda dn4, [sp, #-292]; st dc4, [sp, #-284]; mov p0, #0 // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda m2, [sp, #-344]; paddb.3d [p0], d2; st m2, [sp, #-280] // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dc2, [sp, #-268] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dc6, [sp, #-252] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda dj6, [sp, #-320]; st dj0, [sp, #-304] // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: st m4, [sp, #-296] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda m6, [sp, #-328]; st dn0, [sp, #-308] // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda dc2, [sp, #-332]; st dn4, [sp, #-292] // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: mov dn2, m2 +; COARSE-GRAINED-NEXT: lda m2, [sp, #-216]; movs dj2, m2; mov dn6, m2 // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda m0, [sp, #-312]; movs dc6, m2; mov m0, m2 // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda dj4, [sp, #-288]; movs dj4, dj6; mov dc4, m2 // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda dc0, [sp, #-300]; st m0, [sp, #-344] // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda dc6, [sp, #-188]; st dj4, [sp, 
#-320]; xor r2, r8, r0; mov p0, #0 // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dc4, [sp, #-284]; paddb.3d [p0], d2; and r2, r2, r0 // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: movs dc0, dc2; jnz r2, #.LBB0_1 +; COARSE-GRAINED-NEXT: st dc0, [sp, #-332] // 4-byte Folded Spill Delay Slot 5 +; COARSE-GRAINED-NEXT: st m0, [sp, #-312] // 4-byte Folded Spill Delay Slot 4 +; COARSE-GRAINED-NEXT: st dj4, [sp, #-288] // 4-byte Folded Spill Delay Slot 3 +; COARSE-GRAINED-NEXT: st dc0, [sp, #-300] // 4-byte Folded Spill Delay Slot 2 +; COARSE-GRAINED-NEXT: movs m1, m2; mov dc5, dc6 // Delay Slot 1 +; COARSE-GRAINED-NEXT: // %bb.4: // %ret.exit +; COARSE-GRAINED-NEXT: lda p6, [sp, #-384] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda r15, [sp, #-380] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda r14, [sp, #-376] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda lr, [sp, #-348] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda r13, [sp, #-372] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda r12, [sp, #-368] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda r11, [sp, #-364] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda r10, [sp, #-360] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda r9, [sp, #-356] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda r8, [sp, #-352] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: ret lr +; COARSE-GRAINED-NEXT: nop // Delay Slot 5 +; COARSE-GRAINED-NEXT: nop // Delay Slot 4 +; COARSE-GRAINED-NEXT: nop // Delay Slot 3 +; COARSE-GRAINED-NEXT: paddxm [sp], #-384 // Delay Slot 2 +; COARSE-GRAINED-NEXT: nop // Delay Slot 1 +entry: + tail call void null(ptr null, ptr null, ptr null) + %2 = trunc i32 %dimsAI.sroa.11.0.copyload.i to i20 + %3 = trunc i32 %dimsAI.sroa.5.0.copyload.i to i20 + %4 = trunc i32 %dimsAI.sroa.7.0.copyload.i to i20 + %5 = trunc i32 %dimsAI.sroa.9.0.copyload.i to i20 + %6 = trunc i32 %dimsAO.sroa.7.0.copyload.i to i20 + %7 = trunc i32 %dimsAO.sroa.4.0.copyload.i to i20 + %8 = 
trunc i32 %dimsAO.sroa.6.0.copyload.i to i20 + %9 = trunc i32 %dimsAO.sroa.0.0.copyload.i to i20 + %10 = trunc i32 %dimsAO.sroa.5.0.copyload.i to i20 + %11 = trunc i32 %dimsW.sroa.4.0.copyload.i to i20 + %12 = trunc i32 %dimsW.sroa.6.0.copyload.i to i20 + br label %for.body.i + +for.body.i: ; preds = %if.end239.i, %entry + %dimsAI.sroa.13.0458.i = phi i32 [ 0, %entry ], [ %40, %if.end239.i ] + %dimsAO.sroa.10.0457.i = phi i32 [ 0, %entry ], [ %29, %if.end239.i ] + %dimsAO.sroa.8.0456.i = phi i32 [ 0, %entry ], [ %27, %if.end239.i ] + %dimsW.sroa.10.0455.i = phi i32 [ 1, %entry ], [ 0, %if.end239.i ] + %dimsW.sroa.8.0454.i = phi i32 [ 0, %entry ], [ %34, %if.end239.i ] + %iterator_psum_cnt1.0452.i = phi i32 [ 0, %entry ], [ %22, %if.end239.i ] + %iterator_pout_cnt0.0451.i = phi i32 [ 0, %entry ], [ %45, %if.end239.i ] + %13 = trunc i32 0 to i20 + %14 = trunc i32 %iterator_psum_cnt1.0452.i to i20 + %15 = tail call { ptr, i20, i20 } @llvm.aie2p.add.3d(ptr null, i20 0, i20 0, i20 0, i20 0, i20 %13, i20 0, i20 %14) + %16 = extractvalue { ptr, i20, i20 } %15, 2 + %17 = trunc i32 %dimsAI.sroa.13.0458.i to i20 + %18 = tail call { ptr addrspace(5), <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.576.3d.bfp16.p5.p5(ptr addrspace(5) null, <32 x i32> zeroinitializer, i32 0, i20 1, i20 0, i20 %17, i20 %0, i20 0, i20 0, i20 %0) + %19 = extractvalue { ptr addrspace(5), <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } %18, 5 + %20 = extractvalue { ptr addrspace(5), <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } %18, 6 + br label %for.body125.i + +for.cond.cleanup124.i: ; preds = %for.body125.i + %21 = extractvalue { ptr, i20, i20 } %15, 1 + %22 = zext i20 %16 to i32 + %23 = trunc i32 %dimsAO.sroa.8.0456.i to i20 + %24 = trunc i32 %dimsAO.sroa.10.0457.i to i20 + %25 = tail call { ptr, i20, i20 } @llvm.aie2p.add.3d(ptr null, i20 %6, i20 %7, i20 %8, i20 %9, i20 %23, i20 %10, i20 %24) + %26 = extractvalue { ptr, i20, i20 } %25, 1 + %27 = zext i20 %26 to i32 
+ %28 = extractvalue { ptr, i20, i20 } %25, 2 + %29 = zext i20 %28 to i32 + %30 = trunc i32 %dimsW.sroa.8.0454.i to i20 + %31 = trunc i32 %dimsW.sroa.10.0455.i to i20 + %32 = tail call { ptr, i20, i20 } @llvm.aie2p.add.3d(ptr null, i20 0, i20 %11, i20 %12, i20 0, i20 %30, i20 0, i20 %31) + %33 = extractvalue { ptr, i20, i20 } %32, 1 + %34 = zext i20 %33 to i32 + %35 = extractvalue { ptr, i20, i20 } %32, 2 + br i1 %1, label %if.else.i14, label %if.end239.i + +for.body125.i: ; preds = %for.body125.i, %for.body.i + %36 = trunc i32 0 to i20 + %37 = trunc i32 0 to i20 + %38 = tail call { ptr addrspace(5), <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.576.3d.bfp16.p5.p5(ptr addrspace(5) null, <32 x i32> zeroinitializer, i32 0, i20 %2, i20 0, i20 %36, i20 %3, i20 %4, i20 %37, i20 %5) + %39 = extractvalue { ptr addrspace(5), <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } %38, 3 + %40 = zext i20 %39 to i32 + %41 = call i1 @llvm.loop.decrement.i32(i32 0) + br i1 %41, label %for.body125.i, label %for.cond.cleanup124.i + +if.else.i14: ; preds = %for.cond.cleanup124.i + %add.ptr.i327.i = getelementptr i8, ptr null, i20 0 + br label %if.end239.i + +if.end239.i: ; preds = %if.else.i14, %for.cond.cleanup124.i + %42 = trunc i32 %iterator_pout_cnt0.0451.i to i20 + %43 = tail call { ptr, i20, i20 } @llvm.aie2p.add.3d(ptr null, i20 0, i20 0, i20 1, i20 0, i20 %42, i20 0, i20 0) + %44 = extractvalue { ptr, i20, i20 } %43, 1 + %45 = zext i20 %44 to i32 + %46 = extractvalue { ptr, i20, i20 } %43, 2 + br i1 %1, label %ret.exit, label %for.body.i + +ret.exit: ; preds = %if.end239.i + ret void +} + +; Function Attrs: nounwind memory(none) +declare { ptr, i20, i20 } @llvm.aie2p.add.3d(ptr, i20, i20, i20, i20, i20, i20, i20) #0 + +; Function Attrs: nounwind memory(argmem: read) +declare { ptr addrspace(5), <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.576.3d.bfp16.p5.p5(ptr addrspace(5), <32 x i32>, i32, i20, i20, i20, i20, i20, i20, 
i20) #1 + +; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn +declare i1 @llvm.loop.decrement.i32(i32) #2 + +; uselistorder directives +uselistorder ptr @llvm.aie2p.add.3d, { 3, 2, 1, 0 } +uselistorder ptr @llvm.aie2p.fifo.ld.pop.576.3d.bfp16.p5.p5, { 1, 0 } + +attributes #0 = { nounwind memory(none) } +attributes #1 = { nounwind memory(argmem: read) } +attributes #2 = { nocallback noduplicate nofree nosync nounwind willreturn } diff --git a/llvm/test/CodeGen/AIE/aie2p/ra/staged-rewrite-expand-copy-bundle.mir b/llvm/test/CodeGen/AIE/aie2p/ra/staged-rewrite-expand-copy-bundle.mir new file mode 100644 index 000000000000..6612aebf82ba --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/ra/staged-rewrite-expand-copy-bundle.mir @@ -0,0 +1,169 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc -O2 -mtriple=aie2p -verify-machineinstrs -start-before=greedy \ +# RUN: -stop-after=aie-unallocated-superreg-rewrite %s -o - | FileCheck %s + +# This example exposes some bundled copies that should be expanded. Please note +# that the bundled copies related to 3d instructions should not be expanded here +# because they already have physical registers assigned (are allocated). 
+ +--- +name: test_expand_copy_bundle +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: test_expand_copy_bundle + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em_as_32bit = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:edjl = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:ednl = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:ednh = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub_dim_size:eds = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub_dim_count:eds = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[MOV_RLC_imm11_pseudo:%[0-9]+]]:erf2 = MOV_RLC_imm11_pseudo 0 + ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub_hi_dim_then_sub_dim_size:eds = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub_mod:eds = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub_dim_stride:eds = COPY [[COPY]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub_hi_dim_then_sub_dim_stride:eds = COPY [[COPY]] + ; CHECK-NEXT: undef [[VBCST_32_:%[0-9]+]].sub_512_lo:vec1024 = VBCST_32 [[MOV_RLC_imm11_pseudo]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:ednl = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:ednh = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:ednl = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:edch = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:edcl = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:edch = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:edcl = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:edcl = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:edch = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[VBCST_32_:%[0-9]+]].sub_512_hi:vec1024 = COPY 
[[VBCST_32_]].sub_512_lo + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:eldfiforeg = COPY [[VBCST_32_]] + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:eps = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:edjh = COPY [[COPY]] + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo2:%[0-9]+]]:ep = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: dead [[MOV_PD_imm11_pseudo2:%[0-9]+]]:ep, dead [[COPY11:%[0-9]+]]:edcl, [[COPY12:%[0-9]+]]:edch = PADD_3D_pseudo_split [[MOV_PD_imm11_pseudo2]], [[MOV_PD_imm11_pseudo]], [[COPY1]], [[COPY]], [[COPY11]], undef %29:em_as_32bit, [[COPY2]], [[COPY14]], [[COPY12]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:eds = COPY [[COPY3]] + ; CHECK-NEXT: undef [[COPY16:%[0-9]+]].sub_ptr:epsrfldf = COPY [[MOV_PD_imm11_pseudo1]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]].sub_fifo:epsrfldf = COPY [[COPY13]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]].sub_avail:epsrfldf = COPY [[MOV_RLC_imm11_pseudo]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:spill_edn_to_er = COPY [[COPY4]] { + ; CHECK-NEXT: internal %56:spill_edn_to_er = COPY [[COPY5]] + ; CHECK-NEXT: } + ; CHECK-NEXT: undef [[COPY18:%[0-9]+]].sub_lo_dim:eds = COPY [[COPY15]].sub_lo_dim { + ; CHECK-NEXT: internal [[COPY18]].sub_hi_dim_then_sub_dim_count:eds = COPY [[COPY15]].sub_hi_dim_then_sub_dim_count + ; CHECK-NEXT: internal [[COPY18]].sub_hi_dim_then_sub_dim_size:eds = COPY [[COPY15]].sub_hi_dim_then_sub_dim_size + ; CHECK-NEXT: internal [[COPY18]].sub_hi_dim_then_sub_dim_stride:eds = COPY [[COPY15]].sub_hi_dim_then_sub_dim_stride + ; CHECK-NEXT: } + ; CHECK-NEXT: dead [[VLD_POP_576_3D_pseudo_split:%[0-9]+]]:vec576, dead [[COPY16:%[0-9]+]].sub_ptr:epsrfldf, dead [[COPY16:%[0-9]+]].sub_fifo:epsrfldf, dead [[COPY16:%[0-9]+]].sub_avail:epsrfldf, [[COPY18:%[0-9]+]].sub_dim_count:eds, [[COPY18:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = VLD_POP_576_3D_pseudo_split [[COPY16]].sub_ptr, [[COPY16]].sub_fifo, [[COPY16]].sub_avail, 
[[COPY18]].sub_mod, [[COPY18]].sub_dim_size, [[COPY18]].sub_dim_stride, [[COPY18]].sub_dim_count, undef [[COPY18]].sub_hi_dim_then_sub_mod, [[COPY18]].sub_hi_dim_then_sub_dim_size, [[COPY18]].sub_hi_dim_then_sub_dim_stride, [[COPY18]].sub_hi_dim_then_sub_dim_count, implicit-def $srfifo_uf :: (load unknown-size from `ptr addrspace(5) null`, align 1, addrspace 5) + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:spill_edc_to_er = COPY [[COPY18]].sub_dim_count { + ; CHECK-NEXT: internal %55:spill_edc_to_er = COPY [[COPY18]].sub_hi_dim_then_sub_dim_count + ; CHECK-NEXT: } + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:ednl = COPY [[COPY17]] { + ; CHECK-NEXT: internal [[COPY5]]:ednh = COPY %56 + ; CHECK-NEXT: } + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:em_as_32bit = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY21:%[0-9]+]]:edjl = COPY [[COPY]] + ; CHECK-NEXT: [[COPY22:%[0-9]+]]:edcl = COPY [[COPY19]] + ; CHECK-NEXT: [[COPY23:%[0-9]+]]:edjh = COPY [[COPY]] + ; CHECK-NEXT: [[COPY24:%[0-9]+]]:edch = COPY %55 + ; CHECK-NEXT: undef [[COPY25:%[0-9]+]].sub_ptr:epsrfldf = COPY [[MOV_PD_imm11_pseudo1]] + ; CHECK-NEXT: [[COPY25:%[0-9]+]].sub_fifo:epsrfldf = COPY [[COPY13]] + ; CHECK-NEXT: [[COPY25:%[0-9]+]].sub_avail:epsrfldf = COPY [[MOV_RLC_imm11_pseudo]] + ; CHECK-NEXT: dead [[VLD_POP_576_3D_pseudo_split1:%[0-9]+]]:vec576, dead [[COPY25:%[0-9]+]].sub_ptr:epsrfldf, dead [[COPY25:%[0-9]+]].sub_fifo:epsrfldf, dead [[COPY25:%[0-9]+]].sub_avail:epsrfldf, dead [[COPY22:%[0-9]+]]:edcl, dead [[COPY24:%[0-9]+]]:edch = VLD_POP_576_3D_pseudo_split [[COPY25]].sub_ptr, [[COPY25]].sub_fifo, [[COPY25]].sub_avail, [[COPY20]], [[COPY4]], [[COPY21]], [[COPY22]], undef %53:em_as_32bit, [[COPY5]], [[COPY23]], [[COPY24]], implicit-def $srfifo_uf :: (load unknown-size from `ptr addrspace(5) null`, align 1, addrspace 5) + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo3:%[0-9]+]]:ep = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: [[COPY26:%[0-9]+]]:ednh = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY27:%[0-9]+]]:em_as_32bit = COPY 
[[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY28:%[0-9]+]]:edjl = COPY [[COPY]] + ; CHECK-NEXT: [[COPY29:%[0-9]+]]:edjh = COPY [[COPY]] + ; CHECK-NEXT: dead [[MOV_PD_imm11_pseudo3:%[0-9]+]]:ep, dead [[COPY8:%[0-9]+]]:edcl, [[COPY7:%[0-9]+]]:edch = PADD_3D_pseudo_split [[MOV_PD_imm11_pseudo3]], [[COPY27]], [[COPY6]], [[COPY28]], [[COPY8]], undef %37:em_as_32bit, [[COPY26]], [[COPY29]], [[COPY7]] + ; CHECK-NEXT: [[COPY30:%[0-9]+]]:em_as_32bit = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY31:%[0-9]+]]:ednl = COPY [[COPY26]] + ; CHECK-NEXT: [[COPY32:%[0-9]+]]:edjl = COPY [[COPY]] + ; CHECK-NEXT: [[COPY33:%[0-9]+]]:ednh = COPY [[COPY26]] + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:edcl = MOV_PD_imm11_pseudo 1 + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo4:%[0-9]+]]:ep = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: [[COPY34:%[0-9]+]]:edjh = COPY [[COPY]] + ; CHECK-NEXT: dead [[MOV_PD_imm11_pseudo4:%[0-9]+]]:ep, dead [[COPY10:%[0-9]+]]:edcl, [[COPY9:%[0-9]+]]:edch = PADD_3D_pseudo_split [[MOV_PD_imm11_pseudo4]], [[COPY30]], [[COPY31]], [[COPY32]], [[COPY10]], undef %45:em_as_32bit, [[COPY33]], [[COPY34]], [[COPY9]] + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:edcl = MOV_PD_imm11_pseudo 1 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:edcl = MOV_PD_imm11_pseudo 1 + ; CHECK-NEXT: PseudoJ_jump_imm %bb.1 + bb.0: + successors: %bb.1(0x80000000) + + undef %90.sub_mod:eds = MOV_PD_imm11_pseudo 0 + %90.sub_dim_stride:eds = COPY %90.sub_mod + %90.sub_dim_size:eds = COPY %90.sub_mod + %90.sub_hi_dim_then_sub_dim_size:eds = COPY %90.sub_mod + undef %83.sub_dim_size:eds = COPY %90.sub_mod + %83.sub_dim_count:eds = COPY %90.sub_mod + %22:erf2 = MOV_RLC_imm11_pseudo 0 + %83.sub_hi_dim_then_sub_dim_size:eds = COPY %90.sub_mod + %83.sub_hi_dim_then_sub_dim_count:eds = COPY %90.sub_mod + %83.sub_mod:eds = COPY %90.sub_mod + %83.sub_dim_stride:eds = COPY %90.sub_dim_stride + %83.sub_hi_dim_then_sub_dim_stride:eds = COPY %90.sub_dim_stride + undef %21.sub_512_lo:vec1024 = VBCST_32 %22 + undef %77.sub_dim_size:eds = COPY 
%90.sub_mod + %77.sub_hi_dim_then_sub_dim_size:eds = COPY %90.sub_mod + undef %71.sub_dim_size:eds = COPY %90.sub_mod + %71.sub_hi_dim_then_sub_dim_count:eds = COPY %90.sub_mod + %71.sub_dim_count:eds = COPY %90.sub_mod + undef %66.sub_hi_dim_then_sub_dim_count:eds = COPY %90.sub_mod + %66.sub_dim_count:eds = COPY %90.sub_mod + %90.sub_dim_count:eds = COPY %90.sub_mod + %90.sub_hi_dim_then_sub_dim_count:eds = COPY %90.sub_mod + %21.sub_512_hi:vec1024 = COPY %21.sub_512_lo + %64:eldfiforeg = COPY %21 + %20:eps = MOV_PD_imm11_pseudo 0 + + bb.1: + successors: %bb.1(0x80000000) + + %90.sub_hi_dim_then_sub_dim_stride:eds = COPY %90.sub_dim_stride + %8:ep = MOV_PD_imm11_pseudo 0 + dead %8:ep, %90.sub_dim_count:eds, %90.sub_hi_dim_then_sub_dim_count:eds = PADD_3D_pseudo_split %8, %90.sub_mod, %90.sub_dim_size, %90.sub_dim_stride, %90.sub_dim_count, undef %90.sub_hi_dim_then_sub_mod, %90.sub_hi_dim_then_sub_dim_size, %90.sub_hi_dim_then_sub_dim_stride, %90.sub_hi_dim_then_sub_dim_count + %104:eds = COPY %83 + undef %103.sub_ptr:epsrfldf = COPY %20 + %103.sub_fifo:epsrfldf = COPY %64 + %103.sub_avail:epsrfldf = COPY %22 + dead %82:vec576, dead %103.sub_ptr:epsrfldf, dead %103.sub_fifo:epsrfldf, dead %103.sub_avail:epsrfldf, %104.sub_dim_count:eds, %104.sub_hi_dim_then_sub_dim_count:eds = VLD_POP_576_3D_pseudo_split %103.sub_ptr, %103.sub_fifo, %103.sub_avail, %104.sub_mod, %104.sub_dim_size, %104.sub_dim_stride, %104.sub_dim_count, undef %104.sub_hi_dim_then_sub_mod, %104.sub_hi_dim_then_sub_dim_size, %104.sub_hi_dim_then_sub_dim_stride, %104.sub_hi_dim_then_sub_dim_count, implicit-def $srfifo_uf :: (load unknown-size from `ptr addrspace(5) null`, align 1, addrspace 5) + %77.sub_mod:eds = COPY %90.sub_mod + %77.sub_dim_stride:eds = COPY %90.sub_dim_stride + %77.sub_dim_count:eds = COPY %104.sub_dim_count + %77.sub_hi_dim_then_sub_dim_stride:eds = COPY %90.sub_dim_stride + %77.sub_hi_dim_then_sub_dim_count:eds = COPY %104.sub_hi_dim_then_sub_dim_count + undef 
%105.sub_ptr:epsrfldf = COPY %20 + %105.sub_fifo:epsrfldf = COPY %64 + %105.sub_avail:epsrfldf = COPY %22 + dead %76:vec576, dead %105.sub_ptr:epsrfldf, dead %105.sub_fifo:epsrfldf, dead %105.sub_avail:epsrfldf, %77.sub_dim_count:eds, %77.sub_hi_dim_then_sub_dim_count:eds = VLD_POP_576_3D_pseudo_split %105.sub_ptr, %105.sub_fifo, %105.sub_avail, %77.sub_mod, %77.sub_dim_size, %77.sub_dim_stride, %77.sub_dim_count, undef %77.sub_hi_dim_then_sub_mod, %77.sub_hi_dim_then_sub_dim_size, %77.sub_hi_dim_then_sub_dim_stride, %77.sub_hi_dim_then_sub_dim_count, implicit-def $srfifo_uf :: (load unknown-size from `ptr addrspace(5) null`, align 1, addrspace 5) + %33:ep = MOV_PD_imm11_pseudo 0 + %71.sub_hi_dim_then_sub_dim_size:eds = COPY %90.sub_mod + %71.sub_mod:eds = COPY %90.sub_mod + %71.sub_dim_stride:eds = COPY %90.sub_dim_stride + %71.sub_hi_dim_then_sub_dim_stride:eds = COPY %90.sub_dim_stride + dead %33:ep, %71.sub_dim_count:eds, %71.sub_hi_dim_then_sub_dim_count:eds = PADD_3D_pseudo_split %33, %71.sub_mod, %71.sub_dim_size, %71.sub_dim_stride, %71.sub_dim_count, undef %71.sub_hi_dim_then_sub_mod, %71.sub_hi_dim_then_sub_dim_size, %71.sub_hi_dim_then_sub_dim_stride, %71.sub_hi_dim_then_sub_dim_count + %66.sub_mod:eds = COPY %90.sub_mod + %66.sub_dim_size:eds = COPY %71.sub_hi_dim_then_sub_dim_size + %66.sub_dim_stride:eds = COPY %90.sub_dim_stride + %66.sub_hi_dim_then_sub_dim_size:eds = COPY %71.sub_hi_dim_then_sub_dim_size + %71.sub_dim_count:eds = MOV_PD_imm11_pseudo 1 + %39:ep = MOV_PD_imm11_pseudo 0 + %66.sub_hi_dim_then_sub_dim_stride:eds = COPY %90.sub_dim_stride + dead %39:ep, %66.sub_dim_count:eds, %66.sub_hi_dim_then_sub_dim_count:eds = PADD_3D_pseudo_split %39, %66.sub_mod, %66.sub_dim_size, %66.sub_dim_stride, %66.sub_dim_count, undef %66.sub_hi_dim_then_sub_mod, %66.sub_hi_dim_then_sub_dim_size, %66.sub_hi_dim_then_sub_dim_stride, %66.sub_hi_dim_then_sub_dim_count + %66.sub_dim_count:eds = MOV_PD_imm11_pseudo 1 + %90.sub_dim_count:eds = MOV_PD_imm11_pseudo 
1 + PseudoJ_jump_imm %bb.1 + +... From 925f9d9816604316711cd2cd80d7b897253d5163 Mon Sep 17 00:00:00 2001 From: Andreu Carminati Date: Mon, 13 Oct 2025 09:30:35 -0600 Subject: [PATCH 9/9] [AIEX] Expand copy bundles for unallocated dest VRegs This avoids cycles in bundles that appear in VirtRegRewriter. We also update LIs related to src and dst operands of those expanded copies. Co-Authored-By: Krishnam Tibrewala --- .../AIE/AIEUnallocatedSuperRegRewriter.cpp | 64 +++++++++ .../AIE/aie2p/ra/staged-ra-cycle-in-bundle.ll | 129 +++++++++++++++--- .../ra/staged-rewrite-expand-copy-bundle.mir | 62 ++++----- 3 files changed, 204 insertions(+), 51 deletions(-) diff --git a/llvm/lib/Target/AIE/AIEUnallocatedSuperRegRewriter.cpp b/llvm/lib/Target/AIE/AIEUnallocatedSuperRegRewriter.cpp index e70126327f37..dbf070c91a3e 100644 --- a/llvm/lib/Target/AIE/AIEUnallocatedSuperRegRewriter.cpp +++ b/llvm/lib/Target/AIE/AIEUnallocatedSuperRegRewriter.cpp @@ -124,6 +124,67 @@ void rewriteCandidates(RegRewriteInfo &RegistersToRewrite, } } +/// Unbundle COPY/KILL instruction bundles for registers being rewritten. +/// Bundled instructions are separated into individual instructions with updated +/// slot indexes, and live intervals are repaired for affected registers. +static void expandCopyBundles(RegRewriteInfo &RegistersToRewrite, + MachineRegisterInfo &MRI, SlotIndexes &Indexes, + LiveIntervals &LIS, VirtRegMap &VRM, + LiveRegMatrix &LRM) { + + SmallSet RegistersToRepair; + for (auto [VReg, SubRegs] : RegistersToRewrite) { + + for (MachineInstr &MI : MRI.def_instructions(VReg)) { + + // Only the last instruction of a bundle is handled: bundled with a + // predecessor but not with a successor (its opcode is not checked). + if (!MI.isBundledWithPred() || MI.isBundledWithSucc()) + continue; + + SmallVector MIs({&MI}); + + // Walk backwards collecting the preceding bundled COPY/KILL instructions. + // NOTE(review): a non-COPY/KILL only stops the walk (`break`); the + // instructions gathered so far are still unbundled below - intended?
+ MachineBasicBlock &MBB = *MI.getParent(); + for (MachineBasicBlock::reverse_instr_iterator + I = std::next(MI.getReverseIterator()), + E = MBB.instr_rend(); + I != E && I->isBundledWithSucc(); ++I) { + if (!I->isCopy() && !I->isKill()) + break; + MIs.push_back(&*I); + } + + // Unbundle in program order; MIs was collected back to front. + MachineInstr *FirstMI = MIs.back(); + MachineInstr *BundleStart = FirstMI; + for (MachineInstr *BundledMI : llvm::reverse(MIs)) { + // If instruction is in the middle of the bundle, move it before the + // bundle starts, otherwise, just unbundle it. When we get to the last + // instruction, the bundle will have been completely undone. + if (BundledMI != BundleStart) { + BundledMI->removeFromBundle(); + MBB.insert(BundleStart, BundledMI); + } else if (BundledMI->isBundledWithSucc()) { + BundledMI->unbundleFromSucc(); + BundleStart = &*std::next(BundledMI->getIterator()); + } + + if (BundledMI != FirstMI) { + Indexes.insertMachineInstrInMaps(*BundledMI); + RegistersToRepair.insert(BundledMI->getOperand(0).getReg()); + RegistersToRepair.insert(BundledMI->getOperand(1).getReg()); + BundledMI->getOperand(0).setIsInternalRead(false); + } + } + } + } + + AIESuperRegUtils::repairLiveIntervals(RegistersToRepair, VRM, LRM, LIS); +} + bool AIEUnallocatedSuperRegRewriter::runOnMachineFunction(MachineFunction &MF) { LLVM_DEBUG(llvm::dbgs() << "*** Splitting unallocated super-registers: " << MF.getName() << " ***\n"); @@ -146,6 +207,9 @@ bool AIEUnallocatedSuperRegRewriter::runOnMachineFunction(MachineFunction &MF) { return false; } + LLVM_DEBUG(dbgs() << "Expanding copy bundles...\n"); + expandCopyBundles(RegistersToRewrite, MRI, Indexes, LIS, VRM, LRM); + LLVM_DEBUG(dbgs() << "Performing register rewrites...\n"); rewriteCandidates(RegistersToRewrite, MRI, TRI, VRM, LRM, LIS, Indexes, DebugVars); diff --git a/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-cycle-in-bundle.ll b/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-cycle-in-bundle.ll index 22094118685d..785e3e6c6e49
100644 --- a/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-cycle-in-bundle.ll +++ b/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-cycle-in-bundle.ll @@ -6,34 +6,125 @@ ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception ; ; (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates -; RUN: not llc -mtriple aie2p -o %t.s %s 2>&1 | FileCheck %s --check-prefix=BUNDLE-ERROR +; RUN: llc -mtriple=aie2p -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=FINE-GRAINED ; RUN: llc -mtriple=aie2p --aie-staged-ra-fine-grained-alloc=false %s -o - | FileCheck %s --check-prefix=COARSE-GRAINED ; Function Attrs: nounwind readnone -; BUNDLE-ERROR: error: register rewriting failed: cycle in copy bundle define void @heavy_3d_user(i32 %dimsAI.sroa.5.0.copyload.i, i32 %dimsAI.sroa.7.0.copyload.i, i32 %dimsAI.sroa.9.0.copyload.i, i32 %dimsAO.sroa.7.0.copyload.i, i32 %dimsAO.sroa.4.0.copyload.i, i32 %dimsAO.sroa.6.0.copyload.i, i32 %dimsAO.sroa.0.0.copyload.i, i32 %dimsAO.sroa.5.0.copyload.i, i32 %dimsW.sroa.4.0.copyload.i, i32 %dimsW.sroa.6.0.copyload.i, i20 %0, i1 %1, i32 %dimsAI.sroa.11.0.copyload.i) { +; FINE-GRAINED-LABEL: heavy_3d_user: +; FINE-GRAINED: // %bb.0: // %entry +; FINE-GRAINED-NEXT: nopa ; nopb ; nops ; paddxm [sp], #192; nopv +; FINE-GRAINED-NEXT: st r13, [sp, #-180]; nopx // 4-byte Folded Spill +; FINE-GRAINED-NEXT: st r14, [sp, #-184] // 4-byte Folded Spill +; FINE-GRAINED-NEXT: st r15, [sp, #-188] // 4-byte Folded Spill +; FINE-GRAINED-NEXT: st r9, [sp, #-164] // 4-byte Folded Spill +; FINE-GRAINED-NEXT: st r10, [sp, #-168] // 4-byte Folded Spill +; FINE-GRAINED-NEXT: mova m0, #-196; st r11, [sp, #-172]; mov p1, sp // 4-byte Folded Spill +; FINE-GRAINED-NEXT: padda [p1], m0; st p6, [sp, #-192] // 4-byte Folded Spill +; FINE-GRAINED-NEXT: lda dj0, [p1], #-4; st lr, [sp, #-156] // 4-byte Folded Spill +; FINE-GRAINED-NEXT: lda dj0, [p1], #-4; st r8, [sp, #-160] // 4-byte Folded Spill +; FINE-GRAINED-NEXT: lda r8, [p1, #-4]; st r12, [sp, #-176]; movx r16, #0; 
mov p3, #0 // 4-byte Folded Spill +; FINE-GRAINED-NEXT: lda r12, [p1, #0]; st r0, [sp, #-144]; vbcst.32 x0, r16 // 4-byte Folded Spill +; FINE-GRAINED-NEXT: st r1, [sp, #-140]; jl p3; vmov x1, x0 // 4-byte Folded Spill +; FINE-GRAINED-NEXT: vst x0, [sp, #-128] // 64-byte Folded Spill Delay Slot 5 +; FINE-GRAINED-NEXT: vst x1, [sp, #-64]; mov p6, p0 // 64-byte Folded Spill Delay Slot 4 +; FINE-GRAINED-NEXT: mova p2, #0; st dj0, [sp, #-152]; or r13, r2, r2; mov r14, r3 // 4-byte Folded Spill Delay Slot 3 +; FINE-GRAINED-NEXT: mova p0, #0; st dj0, [sp, #-148]; or r15, r4, r4; mov r9, r5 // 4-byte Folded Spill Delay Slot 2 +; FINE-GRAINED-NEXT: mova p1, #0; or r10, r6, r6; mov r11, r7 // Delay Slot 1 +; FINE-GRAINED-NEXT: movs dn3, r10; mov dj3, r15 +; FINE-GRAINED-NEXT: mova dn1, #0; movs m3, r14; mov dj7, r9 +; FINE-GRAINED-NEXT: vlda x2, [sp, #-128]; movs dn7, r11; mov dj1, #1 // 64-byte Folded Reload +; FINE-GRAINED-NEXT: vlda x3, [sp, #-64]; movs m4, dj1; mov r3, dn1 // 64-byte Folded Reload +; FINE-GRAINED-NEXT: mova dc0, #0; movs dc2, dn1; mov r4, dn1 +; FINE-GRAINED-NEXT: lda r22, [sp, #-152]; movs dc7, dn1; mov r20, dn1 // 4-byte Folded Reload +; FINE-GRAINED-NEXT: lda r21, [sp, #-148]; movs dc3, dn1; mov r19, dn1 // 4-byte Folded Reload +; FINE-GRAINED-NEXT: lda r0, [sp, #-144]; movs dc4, dj1; mov r5, dn1 // 4-byte Folded Reload +; FINE-GRAINED-NEXT: mova m5, #0; movs dj4, dj1; mov r6, dj1 +; FINE-GRAINED-NEXT: mova r7, #1; movs dj0, m5; movx r18, #0; vmov lfl0, x2 +; FINE-GRAINED-NEXT: lda r1, [sp, #-140]; movs dn4, m5; and r16, r12, r7; vmov lfh0, x3 // 4-byte Folded Reload +; FINE-GRAINED-NEXT: .LBB0_1: // %for.body.i +; FINE-GRAINED-NEXT: // =>This Loop Header: Depth=1 +; FINE-GRAINED-NEXT: // Child Loop BB0_2 Depth 2 +; FINE-GRAINED-NEXT: nopa ; nopb ; nops ; nopx ; mov dn2, r3; nopv +; FINE-GRAINED-NEXT: movs dj2, p6; nopx ; mov dn6, r3 +; FINE-GRAINED-NEXT: movs dj6, p6; mov m2, m4 +; FINE-GRAINED-NEXT: mova p1, #0; movs dc6, r4; mov r25, r18 +; 
FINE-GRAINED-NEXT: vldb.pop.576.3d ex0, [p1, lf1, r25, d2] +; FINE-GRAINED-NEXT: mov m1, m5 +; FINE-GRAINED-NEXT: movs dj1, m5; mov dn1, r3 +; FINE-GRAINED-NEXT: movs dc1, dc0; vmov lfl1, lfl0 +; FINE-GRAINED-NEXT: movs dn5, r3; vmov lfh1, lfh0 +; FINE-GRAINED-NEXT: mova p0, #0; movs dj5, m5; mov dc5, r19 +; FINE-GRAINED-NEXT: paddb.3d [p0], d1 +; FINE-GRAINED-NEXT: mova p0, #0; mov r19, dc5 +; FINE-GRAINED-NEXT: .LBB0_2: // %for.body125.i +; FINE-GRAINED-NEXT: // Parent Loop BB0_1 Depth=1 +; FINE-GRAINED-NEXT: // => This Inner Loop Header: Depth=2 +; FINE-GRAINED-NEXT: nopa ; nopb ; nopx ; mov dc6, dc0 +; FINE-GRAINED-NEXT: mov dn2, r3 +; FINE-GRAINED-NEXT: movs dc2, dc0; mov dj2, r0 +; FINE-GRAINED-NEXT: movs m2, r8; mov dj6, r13 +; FINE-GRAINED-NEXT: movs dn6, r1; mov r25, r18 +; FINE-GRAINED-NEXT: movs p1, p0; vmov lfl1, x2 +; FINE-GRAINED-NEXT: .L_LEnd0: +; FINE-GRAINED-NEXT: nopa ; vldb.pop.576.3d ex4, [p1, lf1, r25, d2]; nops ; nopx ; vmov lfh1, x3; nopv +; FINE-GRAINED-NEXT: // %bb.3: // %for.cond.cleanup124.i +; FINE-GRAINED-NEXT: // in Loop: Header=BB0_1 Depth=1 +; FINE-GRAINED-NEXT: nopa ; nopb ; nops ; nopx ; mov m0, m5; nopv +; FINE-GRAINED-NEXT: movs dn0, m5; nopx ; mov m1, m3 +; FINE-GRAINED-NEXT: movs dn1, dn3; mov dj1, dj3 +; FINE-GRAINED-NEXT: mova p0, #0; movs dn5, dn7; mov dj5, dj7 +; FINE-GRAINED-NEXT: movs dc0, r5; paddb.3d [p0], d3; mov dj7, r21 +; FINE-GRAINED-NEXT: movs dj3, r22; mov dn3, m5 +; FINE-GRAINED-NEXT: movs m3, m5; mov dn7, m5 +; FINE-GRAINED-NEXT: movs dc1, dc3; xor r17, r12, r7; mov dc5, dc7 +; FINE-GRAINED-NEXT: movs dc3, r20; and r17, r17, r7; mov dc7, dc4 +; FINE-GRAINED-NEXT: mova p1, #0; movs dc4, m5; jnz r17, #.LBB0_1 +; FINE-GRAINED-NEXT: movs m3, m1; paddb.3d [p1], d3; mov dn3, dn1 // Delay Slot 5 +; FINE-GRAINED-NEXT: mova p0, #0; movs dj3, dj1; mov dn7, dn5 // Delay Slot 4 +; FINE-GRAINED-NEXT: movs dj7, dj5; paddb.3d [p0], d0; mov r20, dc3 // Delay Slot 3 +; FINE-GRAINED-NEXT: movs dc4, m5; mov dc3, dc1 // Delay Slot 
2 +; FINE-GRAINED-NEXT: mova dc0, #0; movs dc7, dc5; mov r5, dc0 // Delay Slot 1 +; FINE-GRAINED-NEXT: // %bb.4: // %ret.exit +; FINE-GRAINED-NEXT: lda p6, [sp, #-192] // 4-byte Folded Reload +; FINE-GRAINED-NEXT: lda r15, [sp, #-188] // 4-byte Folded Reload +; FINE-GRAINED-NEXT: lda r14, [sp, #-184] // 4-byte Folded Reload +; FINE-GRAINED-NEXT: lda lr, [sp, #-156] // 4-byte Folded Reload +; FINE-GRAINED-NEXT: lda r13, [sp, #-180] // 4-byte Folded Reload +; FINE-GRAINED-NEXT: lda r12, [sp, #-176] // 4-byte Folded Reload +; FINE-GRAINED-NEXT: lda r11, [sp, #-172] // 4-byte Folded Reload +; FINE-GRAINED-NEXT: lda r10, [sp, #-168] // 4-byte Folded Reload +; FINE-GRAINED-NEXT: lda r9, [sp, #-164] // 4-byte Folded Reload +; FINE-GRAINED-NEXT: lda r8, [sp, #-160] // 4-byte Folded Reload +; FINE-GRAINED-NEXT: ret lr +; FINE-GRAINED-NEXT: nop // Delay Slot 5 +; FINE-GRAINED-NEXT: nop // Delay Slot 4 +; FINE-GRAINED-NEXT: nop // Delay Slot 3 +; FINE-GRAINED-NEXT: paddxm [sp], #-192 // Delay Slot 2 +; FINE-GRAINED-NEXT: nop // Delay Slot 1 +; ; COARSE-GRAINED-LABEL: heavy_3d_user: ; COARSE-GRAINED: // %bb.0: // %entry -; COARSE-GRAINED-NEXT: nopa ; nopb ; paddxm [sp], #384; nops -; COARSE-GRAINED-NEXT: mova m0, #-388; st r9, [sp, #-356]; mov p1, sp // 4-byte Folded Spill -; COARSE-GRAINED-NEXT: mova m0, #-392; paddb [p1], m0; st r10, [sp, #-360] // 4-byte Folded Spill -; COARSE-GRAINED-NEXT: lda dj0, [p1, #0]; st r11, [sp, #-364]; mov p1, sp // 4-byte Folded Spill -; COARSE-GRAINED-NEXT: mova m0, #-400; paddb [p1], m0; st r12, [sp, #-368] // 4-byte Folded Spill -; COARSE-GRAINED-NEXT: lda dj4, [p1, #0]; st r13, [sp, #-372]; mov p1, sp // 4-byte Folded Spill -; COARSE-GRAINED-NEXT: padda [p1], m0; st r14, [sp, #-376] // 4-byte Folded Spill -; COARSE-GRAINED-NEXT: lda m0, [p1, #0]; st r15, [sp, #-380] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: nopa ; nopb ; nops ; paddxm [sp], #384; nopv +; COARSE-GRAINED-NEXT: st r9, [sp, #-356]; nopb ; nopx // 4-byte Folded Spill +; 
COARSE-GRAINED-NEXT: st r10, [sp, #-360] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st r11, [sp, #-364] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: mova m0, #-388; st r12, [sp, #-368]; mov p1, sp // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: padda [p1], m0; st r13, [sp, #-372] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda dj0, [p1], #-4; st r14, [sp, #-376] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st r15, [sp, #-380] // 4-byte Folded Spill ; COARSE-GRAINED-NEXT: st p6, [sp, #-384] // 4-byte Folded Spill -; COARSE-GRAINED-NEXT: mova r16, #0; st lr, [sp, #-348] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda dj4, [p1], #-4; st lr, [sp, #-348]; movx r16, #0 // 4-byte Folded Spill ; COARSE-GRAINED-NEXT: st r8, [sp, #-352]; vbcst.32 x0, r16 // 4-byte Folded Spill -; COARSE-GRAINED-NEXT: st r0, [sp, #-248]; mov p6, p0 // 4-byte Folded Spill -; COARSE-GRAINED-NEXT: vst x0, [sp, #-128]; mov p1, sp // 64-byte Folded Spill +; COARSE-GRAINED-NEXT: st r0, [sp, #-248] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda m0, [p1, #-4]; vst x0, [sp, #-128]; mov p6, p0 // 64-byte Folded Spill ; COARSE-GRAINED-NEXT: st dj0, [sp, #-304] // 4-byte Folded Spill -; COARSE-GRAINED-NEXT: mova m0, #-396; st m0, [sp, #-280] // 4-byte Folded Spill -; COARSE-GRAINED-NEXT: padda [p1], m0; st dj0, [sp, #-272]; vmov x1, x0 // 4-byte Folded Spill -; COARSE-GRAINED-NEXT: lda r8, [p1, #0]; st dj0, [sp, #-336]; mov p3, #0 // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dj0, [sp, #-272] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dj0, [sp, #-336]; vmov x1, x0 // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda r8, [p1, #0]; st dj4, [sp, #-288]; mov p3, #0 // 4-byte Folded Spill ; COARSE-GRAINED-NEXT: vst x1, [sp, #-64]; jl p3 // 64-byte Folded Spill -; COARSE-GRAINED-NEXT: mova p2, #0; st dj4, [sp, #-288] // 4-byte Folded Spill Delay Slot 5 -; COARSE-GRAINED-NEXT: mova dj4, #1; st dj4, [sp, #-256]; mov r9, r1 // 4-byte Folded Spill Delay Slot 4 +; COARSE-GRAINED-NEXT: mova 
p2, #0; st dj4, [sp, #-256] // 4-byte Folded Spill Delay Slot 5 +; COARSE-GRAINED-NEXT: mova dj4, #1; st m0, [sp, #-280]; mov r9, r1 // 4-byte Folded Spill Delay Slot 4 ; COARSE-GRAINED-NEXT: mova m0, #0; st dj4, [sp, #-320]; or r10, r2, r2; mov r11, r3 // 4-byte Folded Spill Delay Slot 3 ; COARSE-GRAINED-NEXT: mova p0, #0; st m0, [sp, #-344]; or r12, r4, r4; mov r13, r5 // 4-byte Folded Spill Delay Slot 2 ; COARSE-GRAINED-NEXT: mova p1, #0; or r14, r6, r6; mov r15, r7 // Delay Slot 1 @@ -115,7 +206,7 @@ define void @heavy_3d_user(i32 %dimsAI.sroa.5.0.copyload.i, i32 %dimsAI.sroa.7.0 ; COARSE-GRAINED-NEXT: movs dj5, dj7; mov r25, r3 ; COARSE-GRAINED-NEXT: movs p1, p0; vmov lfl1, x2 ; COARSE-GRAINED-NEXT: .L_LEnd0: -; COARSE-GRAINED-NEXT: nopa ; vldb.pop.576.3d ex0, [p1, lf1, r25, d1]; nops ; nopx ; vmov lfh1, x3; nopv +; COARSE-GRAINED-NEXT: nopa ; vldb.pop.576.3d ex4, [p1, lf1, r25, d1]; nops ; nopx ; vmov lfh1, x3; nopv ; COARSE-GRAINED-NEXT: // %bb.3: // %for.cond.cleanup124.i ; COARSE-GRAINED-NEXT: // in Loop: Header=BB0_1 Depth=1 ; COARSE-GRAINED-NEXT: lda m2, [sp, #-344]; nopb ; nopx // 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AIE/aie2p/ra/staged-rewrite-expand-copy-bundle.mir b/llvm/test/CodeGen/AIE/aie2p/ra/staged-rewrite-expand-copy-bundle.mir index 6612aebf82ba..c276bfb65972 100644 --- a/llvm/test/CodeGen/AIE/aie2p/ra/staged-rewrite-expand-copy-bundle.mir +++ b/llvm/test/CodeGen/AIE/aie2p/ra/staged-rewrite-expand-copy-bundle.mir @@ -58,44 +58,42 @@ body: | ; CHECK-NEXT: undef [[COPY16:%[0-9]+]].sub_ptr:epsrfldf = COPY [[MOV_PD_imm11_pseudo1]] ; CHECK-NEXT: [[COPY16:%[0-9]+]].sub_fifo:epsrfldf = COPY [[COPY13]] ; CHECK-NEXT: [[COPY16:%[0-9]+]].sub_avail:epsrfldf = COPY [[MOV_RLC_imm11_pseudo]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:spill_edn_to_er = COPY [[COPY4]] { - ; CHECK-NEXT: internal %56:spill_edn_to_er = COPY [[COPY5]] - ; CHECK-NEXT: } - ; CHECK-NEXT: undef [[COPY18:%[0-9]+]].sub_lo_dim:eds = COPY [[COPY15]].sub_lo_dim { - ; CHECK-NEXT: 
internal [[COPY18]].sub_hi_dim_then_sub_dim_count:eds = COPY [[COPY15]].sub_hi_dim_then_sub_dim_count - ; CHECK-NEXT: internal [[COPY18]].sub_hi_dim_then_sub_dim_size:eds = COPY [[COPY15]].sub_hi_dim_then_sub_dim_size - ; CHECK-NEXT: internal [[COPY18]].sub_hi_dim_then_sub_dim_stride:eds = COPY [[COPY15]].sub_hi_dim_then_sub_dim_stride - ; CHECK-NEXT: } - ; CHECK-NEXT: dead [[VLD_POP_576_3D_pseudo_split:%[0-9]+]]:vec576, dead [[COPY16:%[0-9]+]].sub_ptr:epsrfldf, dead [[COPY16:%[0-9]+]].sub_fifo:epsrfldf, dead [[COPY16:%[0-9]+]].sub_avail:epsrfldf, [[COPY18:%[0-9]+]].sub_dim_count:eds, [[COPY18:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = VLD_POP_576_3D_pseudo_split [[COPY16]].sub_ptr, [[COPY16]].sub_fifo, [[COPY16]].sub_avail, [[COPY18]].sub_mod, [[COPY18]].sub_dim_size, [[COPY18]].sub_dim_stride, [[COPY18]].sub_dim_count, undef [[COPY18]].sub_hi_dim_then_sub_mod, [[COPY18]].sub_hi_dim_then_sub_dim_size, [[COPY18]].sub_hi_dim_then_sub_dim_stride, [[COPY18]].sub_hi_dim_then_sub_dim_count, implicit-def $srfifo_uf :: (load unknown-size from `ptr addrspace(5) null`, align 1, addrspace 5) - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:spill_edc_to_er = COPY [[COPY18]].sub_dim_count { - ; CHECK-NEXT: internal %55:spill_edc_to_er = COPY [[COPY18]].sub_hi_dim_then_sub_dim_count + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:spill_edn_to_er = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:spill_edn_to_er = COPY [[COPY5]] + ; CHECK-NEXT: undef [[COPY19:%[0-9]+]].sub_lo_dim:eds = COPY [[COPY15]].sub_lo_dim { + ; CHECK-NEXT: internal [[COPY19]].sub_hi_dim_then_sub_dim_count:eds = COPY [[COPY15]].sub_hi_dim_then_sub_dim_count + ; CHECK-NEXT: internal [[COPY19]].sub_hi_dim_then_sub_dim_size:eds = COPY [[COPY15]].sub_hi_dim_then_sub_dim_size + ; CHECK-NEXT: internal [[COPY19]].sub_hi_dim_then_sub_dim_stride:eds = COPY [[COPY15]].sub_hi_dim_then_sub_dim_stride ; CHECK-NEXT: } + ; CHECK-NEXT: dead [[VLD_POP_576_3D_pseudo_split:%[0-9]+]]:vec576, dead [[COPY16:%[0-9]+]].sub_ptr:epsrfldf, dead 
[[COPY16:%[0-9]+]].sub_fifo:epsrfldf, dead [[COPY16:%[0-9]+]].sub_avail:epsrfldf, [[COPY19:%[0-9]+]].sub_dim_count:eds, [[COPY19:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = VLD_POP_576_3D_pseudo_split [[COPY16]].sub_ptr, [[COPY16]].sub_fifo, [[COPY16]].sub_avail, [[COPY19]].sub_mod, [[COPY19]].sub_dim_size, [[COPY19]].sub_dim_stride, [[COPY19]].sub_dim_count, undef [[COPY19]].sub_hi_dim_then_sub_mod, [[COPY19]].sub_hi_dim_then_sub_dim_size, [[COPY19]].sub_hi_dim_then_sub_dim_stride, [[COPY19]].sub_hi_dim_then_sub_dim_count, implicit-def $srfifo_uf :: (load unknown-size from `ptr addrspace(5) null`, align 1, addrspace 5) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:spill_edc_to_er = COPY [[COPY19]].sub_dim_count + ; CHECK-NEXT: [[COPY21:%[0-9]+]]:spill_edc_to_er = COPY [[COPY19]].sub_hi_dim_then_sub_dim_count ; CHECK-NEXT: [[COPY4:%[0-9]+]]:ednl = COPY [[COPY17]] { - ; CHECK-NEXT: internal [[COPY5]]:ednh = COPY %56 + ; CHECK-NEXT: internal [[COPY5]]:ednh = COPY [[COPY18]] ; CHECK-NEXT: } - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:em_as_32bit = COPY [[MOV_PD_imm11_pseudo]] - ; CHECK-NEXT: [[COPY21:%[0-9]+]]:edjl = COPY [[COPY]] - ; CHECK-NEXT: [[COPY22:%[0-9]+]]:edcl = COPY [[COPY19]] - ; CHECK-NEXT: [[COPY23:%[0-9]+]]:edjh = COPY [[COPY]] - ; CHECK-NEXT: [[COPY24:%[0-9]+]]:edch = COPY %55 - ; CHECK-NEXT: undef [[COPY25:%[0-9]+]].sub_ptr:epsrfldf = COPY [[MOV_PD_imm11_pseudo1]] - ; CHECK-NEXT: [[COPY25:%[0-9]+]].sub_fifo:epsrfldf = COPY [[COPY13]] - ; CHECK-NEXT: [[COPY25:%[0-9]+]].sub_avail:epsrfldf = COPY [[MOV_RLC_imm11_pseudo]] - ; CHECK-NEXT: dead [[VLD_POP_576_3D_pseudo_split1:%[0-9]+]]:vec576, dead [[COPY25:%[0-9]+]].sub_ptr:epsrfldf, dead [[COPY25:%[0-9]+]].sub_fifo:epsrfldf, dead [[COPY25:%[0-9]+]].sub_avail:epsrfldf, dead [[COPY22:%[0-9]+]]:edcl, dead [[COPY24:%[0-9]+]]:edch = VLD_POP_576_3D_pseudo_split [[COPY25]].sub_ptr, [[COPY25]].sub_fifo, [[COPY25]].sub_avail, [[COPY20]], [[COPY4]], [[COPY21]], [[COPY22]], undef %53:em_as_32bit, [[COPY5]], [[COPY23]], [[COPY24]], 
implicit-def $srfifo_uf :: (load unknown-size from `ptr addrspace(5) null`, align 1, addrspace 5) + ; CHECK-NEXT: [[COPY22:%[0-9]+]]:em_as_32bit = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY23:%[0-9]+]]:edjl = COPY [[COPY]] + ; CHECK-NEXT: [[COPY24:%[0-9]+]]:edcl = COPY [[COPY20]] + ; CHECK-NEXT: [[COPY25:%[0-9]+]]:edjh = COPY [[COPY]] + ; CHECK-NEXT: [[COPY26:%[0-9]+]]:edch = COPY [[COPY21]] + ; CHECK-NEXT: undef [[COPY27:%[0-9]+]].sub_ptr:epsrfldf = COPY [[MOV_PD_imm11_pseudo1]] + ; CHECK-NEXT: [[COPY27:%[0-9]+]].sub_fifo:epsrfldf = COPY [[COPY13]] + ; CHECK-NEXT: [[COPY27:%[0-9]+]].sub_avail:epsrfldf = COPY [[MOV_RLC_imm11_pseudo]] + ; CHECK-NEXT: dead [[VLD_POP_576_3D_pseudo_split1:%[0-9]+]]:vec576, dead [[COPY27:%[0-9]+]].sub_ptr:epsrfldf, dead [[COPY27:%[0-9]+]].sub_fifo:epsrfldf, dead [[COPY27:%[0-9]+]].sub_avail:epsrfldf, dead [[COPY24:%[0-9]+]]:edcl, dead [[COPY26:%[0-9]+]]:edch = VLD_POP_576_3D_pseudo_split [[COPY27]].sub_ptr, [[COPY27]].sub_fifo, [[COPY27]].sub_avail, [[COPY22]], [[COPY4]], [[COPY23]], [[COPY24]], undef %53:em_as_32bit, [[COPY5]], [[COPY25]], [[COPY26]], implicit-def $srfifo_uf :: (load unknown-size from `ptr addrspace(5) null`, align 1, addrspace 5) ; CHECK-NEXT: [[MOV_PD_imm11_pseudo3:%[0-9]+]]:ep = MOV_PD_imm11_pseudo 0 - ; CHECK-NEXT: [[COPY26:%[0-9]+]]:ednh = COPY [[MOV_PD_imm11_pseudo]] - ; CHECK-NEXT: [[COPY27:%[0-9]+]]:em_as_32bit = COPY [[MOV_PD_imm11_pseudo]] - ; CHECK-NEXT: [[COPY28:%[0-9]+]]:edjl = COPY [[COPY]] - ; CHECK-NEXT: [[COPY29:%[0-9]+]]:edjh = COPY [[COPY]] - ; CHECK-NEXT: dead [[MOV_PD_imm11_pseudo3:%[0-9]+]]:ep, dead [[COPY8:%[0-9]+]]:edcl, [[COPY7:%[0-9]+]]:edch = PADD_3D_pseudo_split [[MOV_PD_imm11_pseudo3]], [[COPY27]], [[COPY6]], [[COPY28]], [[COPY8]], undef %37:em_as_32bit, [[COPY26]], [[COPY29]], [[COPY7]] - ; CHECK-NEXT: [[COPY30:%[0-9]+]]:em_as_32bit = COPY [[MOV_PD_imm11_pseudo]] - ; CHECK-NEXT: [[COPY31:%[0-9]+]]:ednl = COPY [[COPY26]] - ; CHECK-NEXT: [[COPY32:%[0-9]+]]:edjl = COPY [[COPY]] - ; 
CHECK-NEXT: [[COPY33:%[0-9]+]]:ednh = COPY [[COPY26]] + ; CHECK-NEXT: [[COPY28:%[0-9]+]]:ednh = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY29:%[0-9]+]]:em_as_32bit = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY30:%[0-9]+]]:edjl = COPY [[COPY]] + ; CHECK-NEXT: [[COPY31:%[0-9]+]]:edjh = COPY [[COPY]] + ; CHECK-NEXT: dead [[MOV_PD_imm11_pseudo3:%[0-9]+]]:ep, dead [[COPY8:%[0-9]+]]:edcl, [[COPY7:%[0-9]+]]:edch = PADD_3D_pseudo_split [[MOV_PD_imm11_pseudo3]], [[COPY29]], [[COPY6]], [[COPY30]], [[COPY8]], undef %37:em_as_32bit, [[COPY28]], [[COPY31]], [[COPY7]] + ; CHECK-NEXT: [[COPY32:%[0-9]+]]:em_as_32bit = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY33:%[0-9]+]]:ednl = COPY [[COPY28]] + ; CHECK-NEXT: [[COPY34:%[0-9]+]]:edjl = COPY [[COPY]] + ; CHECK-NEXT: [[COPY35:%[0-9]+]]:ednh = COPY [[COPY28]] ; CHECK-NEXT: [[COPY8:%[0-9]+]]:edcl = MOV_PD_imm11_pseudo 1 ; CHECK-NEXT: [[MOV_PD_imm11_pseudo4:%[0-9]+]]:ep = MOV_PD_imm11_pseudo 0 - ; CHECK-NEXT: [[COPY34:%[0-9]+]]:edjh = COPY [[COPY]] - ; CHECK-NEXT: dead [[MOV_PD_imm11_pseudo4:%[0-9]+]]:ep, dead [[COPY10:%[0-9]+]]:edcl, [[COPY9:%[0-9]+]]:edch = PADD_3D_pseudo_split [[MOV_PD_imm11_pseudo4]], [[COPY30]], [[COPY31]], [[COPY32]], [[COPY10]], undef %45:em_as_32bit, [[COPY33]], [[COPY34]], [[COPY9]] + ; CHECK-NEXT: [[COPY36:%[0-9]+]]:edjh = COPY [[COPY]] + ; CHECK-NEXT: dead [[MOV_PD_imm11_pseudo4:%[0-9]+]]:ep, dead [[COPY10:%[0-9]+]]:edcl, [[COPY9:%[0-9]+]]:edch = PADD_3D_pseudo_split [[MOV_PD_imm11_pseudo4]], [[COPY32]], [[COPY33]], [[COPY34]], [[COPY10]], undef %45:em_as_32bit, [[COPY35]], [[COPY36]], [[COPY9]] ; CHECK-NEXT: [[COPY10:%[0-9]+]]:edcl = MOV_PD_imm11_pseudo 1 ; CHECK-NEXT: [[COPY11:%[0-9]+]]:edcl = MOV_PD_imm11_pseudo 1 ; CHECK-NEXT: PseudoJ_jump_imm %bb.1