Skip to content

Commit 31b7e71

Browse files
[AIEX] Expand copy bundles for unallocated dest VRegs
This avoids the "cycle in copy bundle" register-rewriting error reported by VirtRegRewriter. We also update the live intervals (LIs) of the source and destination operands of the expanded copies. Co-Authored-By: Krishnam Tibrewala <[email protected]>
1 parent b06ff99 commit 31b7e71

File tree

3 files changed

+195
-50
lines changed

3 files changed

+195
-50
lines changed

llvm/lib/Target/AIE/AIEUnallocatedSuperRegRewriter.cpp

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include "llvm/CodeGen/TargetSubtargetInfo.h"
3333
#include "llvm/CodeGen/VirtRegMap.h"
3434
#include "llvm/Support/Debug.h"
35+
#include "llvm/Support/ErrorHandling.h"
3536
#include "llvm/Support/raw_ostream.h"
3637

3738
using namespace llvm;
@@ -109,6 +110,59 @@ void rewriteCandidates(RegRewriteInfo &RegistersToRewrite,
109110
}
110111
}
111112

113+
/// Undo the COPY/KILL bundles that define one of the virtual registers listed
/// in \p RegistersToRewrite, turning their members into standalone
/// instructions.
///
/// Each defining instruction of a candidate register that terminates a bundle
/// (bundled with its predecessor, not with a successor) is used as the entry
/// point. The bundle is walked backwards from there; it is expanded only if
/// every member, including the entry point, is a COPY or a KILL. Bundles that
/// contain any other instruction are left untouched.
///
/// \param RegistersToRewrite candidate registers; only the register of each
///        entry is read here.
/// \param MRI     used to enumerate the defining instructions of a candidate.
/// \param Indexes receives a slot index for every unbundled instruction except
///        the first one of its former bundle.
/// \param LIS, VRM, LRM forwarded to AIESuperRegUtils::updateLRMAndLIS together
///        with the registers of the first two operands of those instructions.
static void expandCopyBundles(RegRewriteInfo &RegistersToRewrite,
                              MachineRegisterInfo &MRI, SlotIndexes &Indexes,
                              LiveIntervals &LIS, VirtRegMap &VRM,
                              LiveRegMatrix &LRM) {
  const auto IsCopyOrKill = [](const MachineInstr &Candidate) {
    return Candidate.isCopy() || Candidate.isKill();
  };

  SmallSet<Register, 8> RegistersToRepair;

  // Bind by reference: only the register is needed, so there is no reason to
  // copy the per-register payload on every iteration.
  for (const auto &[VReg, SubRegs] : RegistersToRewrite) {
    for (MachineInstr &MI : MRI.def_instructions(VReg)) {
      // Only the last instruction of a bundle is used as the entry point.
      if (!MI.isBundledWithPred() || MI.isBundledWithSucc())
        continue;

      // Operands 0 and 1 are read as registers below, so the entry point has
      // to be a COPY/KILL just like the rest of the bundle.
      if (!IsCopyOrKill(MI))
        continue;

      // Only do this when the complete bundle is made out of COPYs and KILLs.
      // Collect the members from last to first; give up on the whole bundle
      // as soon as a foreign instruction shows up instead of expanding a
      // partial tail.
      SmallVector<MachineInstr *, 2> MIs({&MI});
      bool OnlyCopiesAndKills = true;
      MachineBasicBlock &MBB = *MI.getParent();
      for (MachineBasicBlock::reverse_instr_iterator
               I = std::next(MI.getReverseIterator()),
               E = MBB.instr_rend();
           I != E && I->isBundledWithSucc(); ++I) {
        if (!IsCopyOrKill(*I)) {
          OnlyCopiesAndKills = false;
          break;
        }
        MIs.push_back(&*I);
      }
      if (!OnlyCopiesAndKills)
        continue;

      MachineInstr *FirstMI = MIs.back();

      // MIs is visited in program order, so every instruction reached here is
      // the head of what is left of the bundle: detaching it from its
      // successor is enough, nothing has to be moved. Once the last
      // instruction is reached, the bundle has been completely undone.
      for (MachineInstr *BundledMI : llvm::reverse(MIs)) {
        if (BundledMI->isBundledWithSucc())
          BundledMI->unbundleFromSucc();

        // NOTE(review): the first instruction is skipped, presumably because
        // it keeps the slot index the bundle already had -- confirm.
        if (BundledMI == FirstMI)
          continue;

        Indexes.insertMachineInstrInMaps(*BundledMI);
        RegistersToRepair.insert(BundledMI->getOperand(0).getReg());
        RegistersToRepair.insert(BundledMI->getOperand(1).getReg());
        // The instruction now stands alone, so its first operand must no
        // longer be flagged as an internal read.
        BundledMI->getOperand(0).setIsInternalRead(false);
      }
    }
  }

  AIESuperRegUtils::updateLRMAndLIS(RegistersToRepair, VRM, LRM, LIS);
}
165+
112166
bool AIEUnallocatedSuperRegRewriter::runOnMachineFunction(MachineFunction &MF) {
113167
LLVM_DEBUG(llvm::dbgs() << "*** Splitting unallocated super-registers: "
114168
<< MF.getName() << " ***\n");
@@ -125,6 +179,8 @@ bool AIEUnallocatedSuperRegRewriter::runOnMachineFunction(MachineFunction &MF) {
125179

126180
RegRewriteInfo RegistersToRewrite = getRewriteCandidates(MRI, TRI, VRM);
127181

182+
expandCopyBundles(RegistersToRewrite, MRI, Indexes, LIS, VRM, LRM);
183+
128184
rewriteCandidates(RegistersToRewrite, MRI, TRI, VRM, LRM, LIS, Indexes,
129185
DebugVars);
130186

llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-cycle-in-bundle.ll

Lines changed: 109 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -6,34 +6,125 @@
66
; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
77
;
88
; (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates
9-
; RUN: not llc -mtriple aie2p -o %t.s %s 2>&1 | FileCheck %s --check-prefix=BUNDLE-ERROR
9+
; RUN: llc -mtriple=aie2p -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=FINE-GRAINED
1010
; RUN: llc -mtriple=aie2p --aie-staged-ra-fine-grained-alloc=false %s -o - | FileCheck %s --check-prefix=COARSE-GRAINED
1111

1212
; Function Attrs: nounwind readnone
13-
; BUNDLE-ERROR: error: register rewriting failed: cycle in copy bundle
1413
define void @heavy_3d_user(i32 %dimsAI.sroa.5.0.copyload.i, i32 %dimsAI.sroa.7.0.copyload.i, i32 %dimsAI.sroa.9.0.copyload.i, i32 %dimsAO.sroa.7.0.copyload.i, i32 %dimsAO.sroa.4.0.copyload.i, i32 %dimsAO.sroa.6.0.copyload.i, i32 %dimsAO.sroa.0.0.copyload.i, i32 %dimsAO.sroa.5.0.copyload.i, i32 %dimsW.sroa.4.0.copyload.i, i32 %dimsW.sroa.6.0.copyload.i, i20 %0, i1 %1, i32 %dimsAI.sroa.11.0.copyload.i) {
14+
; FINE-GRAINED-LABEL: heavy_3d_user:
15+
; FINE-GRAINED: // %bb.0: // %entry
16+
; FINE-GRAINED-NEXT: nopa ; nopb ; nops ; paddxm [sp], #192; nopv
17+
; FINE-GRAINED-NEXT: st r13, [sp, #-180]; nopx // 4-byte Folded Spill
18+
; FINE-GRAINED-NEXT: st r14, [sp, #-184] // 4-byte Folded Spill
19+
; FINE-GRAINED-NEXT: st r15, [sp, #-188] // 4-byte Folded Spill
20+
; FINE-GRAINED-NEXT: st r9, [sp, #-164] // 4-byte Folded Spill
21+
; FINE-GRAINED-NEXT: st r10, [sp, #-168] // 4-byte Folded Spill
22+
; FINE-GRAINED-NEXT: mova m0, #-196; st r11, [sp, #-172]; mov p1, sp // 4-byte Folded Spill
23+
; FINE-GRAINED-NEXT: padda [p1], m0; st p6, [sp, #-192] // 4-byte Folded Spill
24+
; FINE-GRAINED-NEXT: lda dj0, [p1], #-4; st lr, [sp, #-156] // 4-byte Folded Spill
25+
; FINE-GRAINED-NEXT: lda dj0, [p1], #-4; st r8, [sp, #-160] // 4-byte Folded Spill
26+
; FINE-GRAINED-NEXT: lda r8, [p1, #-4]; st r12, [sp, #-176]; movx r16, #0; mov p3, #0 // 4-byte Folded Spill
27+
; FINE-GRAINED-NEXT: lda r12, [p1, #0]; st r0, [sp, #-144]; vbcst.32 x0, r16 // 4-byte Folded Spill
28+
; FINE-GRAINED-NEXT: st r1, [sp, #-140]; jl p3; vmov x1, x0 // 4-byte Folded Spill
29+
; FINE-GRAINED-NEXT: vst x0, [sp, #-128] // 64-byte Folded Spill Delay Slot 5
30+
; FINE-GRAINED-NEXT: vst x1, [sp, #-64]; mov p6, p0 // 64-byte Folded Spill Delay Slot 4
31+
; FINE-GRAINED-NEXT: mova p2, #0; st dj0, [sp, #-152]; or r13, r2, r2; mov r14, r3 // 4-byte Folded Spill Delay Slot 3
32+
; FINE-GRAINED-NEXT: mova p0, #0; st dj0, [sp, #-148]; or r15, r4, r4; mov r9, r5 // 4-byte Folded Spill Delay Slot 2
33+
; FINE-GRAINED-NEXT: mova p1, #0; or r10, r6, r6; mov r11, r7 // Delay Slot 1
34+
; FINE-GRAINED-NEXT: movs dn3, r10; mov dj3, r15
35+
; FINE-GRAINED-NEXT: mova dn1, #0; movs m3, r14; mov dj7, r9
36+
; FINE-GRAINED-NEXT: vlda x2, [sp, #-128]; movs dn7, r11; mov dj1, #1 // 64-byte Folded Reload
37+
; FINE-GRAINED-NEXT: vlda x3, [sp, #-64]; movs m4, dj1; mov r3, dn1 // 64-byte Folded Reload
38+
; FINE-GRAINED-NEXT: mova dc0, #0; movs dc2, dn1; mov r4, dn1
39+
; FINE-GRAINED-NEXT: lda r22, [sp, #-152]; movs dc7, dn1; mov r20, dn1 // 4-byte Folded Reload
40+
; FINE-GRAINED-NEXT: lda r21, [sp, #-148]; movs dc3, dn1; mov r19, dn1 // 4-byte Folded Reload
41+
; FINE-GRAINED-NEXT: lda r0, [sp, #-144]; movs dc4, dj1; mov r5, dn1 // 4-byte Folded Reload
42+
; FINE-GRAINED-NEXT: mova m5, #0; movs dj4, dj1; mov r6, dj1
43+
; FINE-GRAINED-NEXT: mova r7, #1; movs dj0, m5; movx r18, #0; vmov lfl0, x2
44+
; FINE-GRAINED-NEXT: lda r1, [sp, #-140]; movs dn4, m5; and r16, r12, r7; vmov lfh0, x3 // 4-byte Folded Reload
45+
; FINE-GRAINED-NEXT: .LBB0_1: // %for.body.i
46+
; FINE-GRAINED-NEXT: // =>This Loop Header: Depth=1
47+
; FINE-GRAINED-NEXT: // Child Loop BB0_2 Depth 2
48+
; FINE-GRAINED-NEXT: nopa ; nopb ; nops ; nopx ; mov dn2, r3; nopv
49+
; FINE-GRAINED-NEXT: movs dj2, p6; nopx ; mov dn6, r3
50+
; FINE-GRAINED-NEXT: movs dj6, p6; mov m2, m4
51+
; FINE-GRAINED-NEXT: mova p1, #0; movs dc6, r4; mov r25, r18
52+
; FINE-GRAINED-NEXT: vldb.pop.576.3d ex0, [p1, lf1, r25, d2]
53+
; FINE-GRAINED-NEXT: mov m1, m5
54+
; FINE-GRAINED-NEXT: movs dj1, m5; mov dn1, r3
55+
; FINE-GRAINED-NEXT: movs dc1, dc0; vmov lfl1, lfl0
56+
; FINE-GRAINED-NEXT: movs dn5, r3; vmov lfh1, lfh0
57+
; FINE-GRAINED-NEXT: mova p0, #0; movs dj5, m5; mov dc5, r19
58+
; FINE-GRAINED-NEXT: paddb.3d [p0], d1
59+
; FINE-GRAINED-NEXT: mova p0, #0; mov r19, dc5
60+
; FINE-GRAINED-NEXT: .LBB0_2: // %for.body125.i
61+
; FINE-GRAINED-NEXT: // Parent Loop BB0_1 Depth=1
62+
; FINE-GRAINED-NEXT: // => This Inner Loop Header: Depth=2
63+
; FINE-GRAINED-NEXT: nopa ; nopb ; nopx ; mov dc6, dc0
64+
; FINE-GRAINED-NEXT: mov dn2, r3
65+
; FINE-GRAINED-NEXT: movs dc2, dc0; mov dj2, r0
66+
; FINE-GRAINED-NEXT: movs m2, r8; mov dj6, r13
67+
; FINE-GRAINED-NEXT: movs dn6, r1; mov r25, r18
68+
; FINE-GRAINED-NEXT: movs p1, p0; vmov lfl1, x2
69+
; FINE-GRAINED-NEXT: .L_LEnd0:
70+
; FINE-GRAINED-NEXT: nopa ; vldb.pop.576.3d ex0, [p1, lf1, r25, d2]; nops ; nopx ; vmov lfh1, x3; nopv
71+
; FINE-GRAINED-NEXT: // %bb.3: // %for.cond.cleanup124.i
72+
; FINE-GRAINED-NEXT: // in Loop: Header=BB0_1 Depth=1
73+
; FINE-GRAINED-NEXT: nopa ; nopb ; nops ; nopx ; mov m0, m5; nopv
74+
; FINE-GRAINED-NEXT: movs dn0, m5; nopx ; mov m1, m3
75+
; FINE-GRAINED-NEXT: movs dn1, dn3; mov dj1, dj3
76+
; FINE-GRAINED-NEXT: mova p0, #0; movs dn5, dn7; mov dj5, dj7
77+
; FINE-GRAINED-NEXT: movs dc0, r5; paddb.3d [p0], d3; mov dj7, r21
78+
; FINE-GRAINED-NEXT: movs dj3, r22; mov dn3, m5
79+
; FINE-GRAINED-NEXT: movs m3, m5; mov dn7, m5
80+
; FINE-GRAINED-NEXT: movs dc1, dc3; xor r17, r12, r7; mov dc5, dc7
81+
; FINE-GRAINED-NEXT: movs dc3, r20; and r17, r17, r7; mov dc7, dc4
82+
; FINE-GRAINED-NEXT: mova p1, #0; movs dc4, m5; jnz r17, #.LBB0_1
83+
; FINE-GRAINED-NEXT: movs m3, m1; paddb.3d [p1], d3; mov dn3, dn1 // Delay Slot 5
84+
; FINE-GRAINED-NEXT: mova p0, #0; movs dj3, dj1; mov dn7, dn5 // Delay Slot 4
85+
; FINE-GRAINED-NEXT: movs dj7, dj5; paddb.3d [p0], d0; mov r20, dc3 // Delay Slot 3
86+
; FINE-GRAINED-NEXT: movs dc4, m5; mov dc3, dc1 // Delay Slot 2
87+
; FINE-GRAINED-NEXT: mova dc0, #0; movs dc7, dc5; mov r5, dc0 // Delay Slot 1
88+
; FINE-GRAINED-NEXT: // %bb.4: // %ret.exit
89+
; FINE-GRAINED-NEXT: lda p6, [sp, #-192] // 4-byte Folded Reload
90+
; FINE-GRAINED-NEXT: lda r15, [sp, #-188] // 4-byte Folded Reload
91+
; FINE-GRAINED-NEXT: lda r14, [sp, #-184] // 4-byte Folded Reload
92+
; FINE-GRAINED-NEXT: lda lr, [sp, #-156] // 4-byte Folded Reload
93+
; FINE-GRAINED-NEXT: lda r13, [sp, #-180] // 4-byte Folded Reload
94+
; FINE-GRAINED-NEXT: lda r12, [sp, #-176] // 4-byte Folded Reload
95+
; FINE-GRAINED-NEXT: lda r11, [sp, #-172] // 4-byte Folded Reload
96+
; FINE-GRAINED-NEXT: lda r10, [sp, #-168] // 4-byte Folded Reload
97+
; FINE-GRAINED-NEXT: lda r9, [sp, #-164] // 4-byte Folded Reload
98+
; FINE-GRAINED-NEXT: lda r8, [sp, #-160] // 4-byte Folded Reload
99+
; FINE-GRAINED-NEXT: ret lr
100+
; FINE-GRAINED-NEXT: nop // Delay Slot 5
101+
; FINE-GRAINED-NEXT: nop // Delay Slot 4
102+
; FINE-GRAINED-NEXT: nop // Delay Slot 3
103+
; FINE-GRAINED-NEXT: paddxm [sp], #-192 // Delay Slot 2
104+
; FINE-GRAINED-NEXT: nop // Delay Slot 1
105+
;
15106
; COARSE-GRAINED-LABEL: heavy_3d_user:
16107
; COARSE-GRAINED: // %bb.0: // %entry
17-
; COARSE-GRAINED-NEXT: nopa ; nopb ; paddxm [sp], #384; nops
18-
; COARSE-GRAINED-NEXT: mova m0, #-388; st r9, [sp, #-356]; mov p1, sp // 4-byte Folded Spill
19-
; COARSE-GRAINED-NEXT: mova m0, #-392; paddb [p1], m0; st r10, [sp, #-360] // 4-byte Folded Spill
20-
; COARSE-GRAINED-NEXT: lda dj0, [p1, #0]; st r11, [sp, #-364]; mov p1, sp // 4-byte Folded Spill
21-
; COARSE-GRAINED-NEXT: mova m0, #-400; paddb [p1], m0; st r12, [sp, #-368] // 4-byte Folded Spill
22-
; COARSE-GRAINED-NEXT: lda dj4, [p1, #0]; st r13, [sp, #-372]; mov p1, sp // 4-byte Folded Spill
23-
; COARSE-GRAINED-NEXT: padda [p1], m0; st r14, [sp, #-376] // 4-byte Folded Spill
24-
; COARSE-GRAINED-NEXT: lda m0, [p1, #0]; st r15, [sp, #-380] // 4-byte Folded Spill
108+
; COARSE-GRAINED-NEXT: nopa ; nopb ; nops ; paddxm [sp], #384; nopv
109+
; COARSE-GRAINED-NEXT: st r9, [sp, #-356]; nopb ; nopx // 4-byte Folded Spill
110+
; COARSE-GRAINED-NEXT: st r10, [sp, #-360] // 4-byte Folded Spill
111+
; COARSE-GRAINED-NEXT: st r11, [sp, #-364] // 4-byte Folded Spill
112+
; COARSE-GRAINED-NEXT: mova m0, #-388; st r12, [sp, #-368]; mov p1, sp // 4-byte Folded Spill
113+
; COARSE-GRAINED-NEXT: padda [p1], m0; st r13, [sp, #-372] // 4-byte Folded Spill
114+
; COARSE-GRAINED-NEXT: lda dj0, [p1], #-4; st r14, [sp, #-376] // 4-byte Folded Spill
115+
; COARSE-GRAINED-NEXT: st r15, [sp, #-380] // 4-byte Folded Spill
25116
; COARSE-GRAINED-NEXT: st p6, [sp, #-384] // 4-byte Folded Spill
26-
; COARSE-GRAINED-NEXT: mova r16, #0; st lr, [sp, #-348] // 4-byte Folded Spill
117+
; COARSE-GRAINED-NEXT: lda dj4, [p1], #-4; st lr, [sp, #-348]; movx r16, #0 // 4-byte Folded Spill
27118
; COARSE-GRAINED-NEXT: st r8, [sp, #-352]; vbcst.32 x0, r16 // 4-byte Folded Spill
28-
; COARSE-GRAINED-NEXT: st r0, [sp, #-248]; mov p6, p0 // 4-byte Folded Spill
29-
; COARSE-GRAINED-NEXT: vst x0, [sp, #-128]; mov p1, sp // 64-byte Folded Spill
119+
; COARSE-GRAINED-NEXT: st r0, [sp, #-248] // 4-byte Folded Spill
120+
; COARSE-GRAINED-NEXT: lda m0, [p1, #-4]; vst x0, [sp, #-128]; mov p6, p0 // 64-byte Folded Spill
30121
; COARSE-GRAINED-NEXT: st dj0, [sp, #-304] // 4-byte Folded Spill
31-
; COARSE-GRAINED-NEXT: mova m0, #-396; st m0, [sp, #-280] // 4-byte Folded Spill
32-
; COARSE-GRAINED-NEXT: padda [p1], m0; st dj0, [sp, #-272]; vmov x1, x0 // 4-byte Folded Spill
33-
; COARSE-GRAINED-NEXT: lda r8, [p1, #0]; st dj0, [sp, #-336]; mov p3, #0 // 4-byte Folded Spill
122+
; COARSE-GRAINED-NEXT: st dj0, [sp, #-272] // 4-byte Folded Spill
123+
; COARSE-GRAINED-NEXT: st dj0, [sp, #-336]; vmov x1, x0 // 4-byte Folded Spill
124+
; COARSE-GRAINED-NEXT: lda r8, [p1, #0]; st dj4, [sp, #-288]; mov p3, #0 // 4-byte Folded Spill
34125
; COARSE-GRAINED-NEXT: vst x1, [sp, #-64]; jl p3 // 64-byte Folded Spill
35-
; COARSE-GRAINED-NEXT: mova p2, #0; st dj4, [sp, #-288] // 4-byte Folded Spill Delay Slot 5
36-
; COARSE-GRAINED-NEXT: mova dj4, #1; st dj4, [sp, #-256]; mov r9, r1 // 4-byte Folded Spill Delay Slot 4
126+
; COARSE-GRAINED-NEXT: mova p2, #0; st dj4, [sp, #-256] // 4-byte Folded Spill Delay Slot 5
127+
; COARSE-GRAINED-NEXT: mova dj4, #1; st m0, [sp, #-280]; mov r9, r1 // 4-byte Folded Spill Delay Slot 4
37128
; COARSE-GRAINED-NEXT: mova m0, #0; st dj4, [sp, #-320]; or r10, r2, r2; mov r11, r3 // 4-byte Folded Spill Delay Slot 3
38129
; COARSE-GRAINED-NEXT: mova p0, #0; st m0, [sp, #-344]; or r12, r4, r4; mov r13, r5 // 4-byte Folded Spill Delay Slot 2
39130
; COARSE-GRAINED-NEXT: mova p1, #0; or r14, r6, r6; mov r15, r7 // Delay Slot 1

0 commit comments

Comments
 (0)