Skip to content

Commit 69e707c

Browse files
author
Martien de Jong
committed
[AIE] Integrate LatencyAware and SWPAware
The enable flags are now integers, with 0 meaning false, 1 meaning true, and anything else meaning auto-select based on LoopClass. This selection avoids running swpaware if latencyaware has run. Add some tuning based on LoopClass.
1 parent 44ed576 commit 69e707c

File tree

9 files changed

+264
-36
lines changed

9 files changed

+264
-36
lines changed

llvm/lib/Target/AIE/AIELoopClass.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,8 @@ static const KernelFeatures Kernels[] = {
7676
{43, {{900, 300, 0, 2460, 0, 0, 3360}, {30, 30, 0, 30, 30}}},
7777
{44, {{900, 300, 0, 2520, 0, 0, 3360}, {30, 30, 0, 30, 30}}},
7878
{45, {{900, 300, 0, 2520, 0, 0, 3360}, {45, 45, 0, 45, 45}}},
79+
{46, {{0, 0, 0, 0, 2160, 0, 120, 1080}, {0, 420, 420}}},
80+
{47, {{0, 0, 0, 0, 360, 0, 240, 360}, {0, 60, 60}}},
7981
};
8082

8183
std::vector<int> getLoopClassScores(const SlotStatistics &Stats) {

llvm/lib/Target/AIE/AIEWawRegRewriter.cpp

Lines changed: 80 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,14 @@ using namespace llvm;
4949
// This might be compatible with a future extension of the DEBUG rigging
5050
#define DEBUG_DETAIL(x) DEBUG_WITH_TYPE("aie-waw-reg-rewrite:2", x)
5151

52+
// Strategy used by the WAW register rewriter, selectable on the command line
// through -aie-reg-rewrite-mode.
//   Basic            - neither the latency-aware nor the SWP-aware strategy
//                      is applied.
//   Automatic        - placeholder that selectMode() resolves, per loop class,
//                      to either SWPAwareAutoBias or LatencyAware.
//   LatencyAware     - the set of high-output-latency registers is computed
//                      and used during renaming.
//   SWPAware         - candidates are sorted by sortSWPAware() using the
//                      default MinII bias (-aie-realloc-ii-bias).
//   SWPAwareAutoBias - like SWPAware, but runSWPAware() may replace the bias
//                      with a value chosen for the loop class.
enum class RewriteMode {
53+
Basic,
54+
Automatic,
55+
LatencyAware,
56+
SWPAware,
57+
SWPAwareAutoBias,
58+
};
59+
5260
static cl::opt<bool> AggressiveReAlloc(
5361
"aie-aggressive-realloc", cl::Hidden, cl::init(false),
5462
cl::desc("Aggressively de-allocate live-through registers to favor "
@@ -66,16 +74,21 @@ static cl::opt<unsigned>
6674
"considered for WAW rewriting"),
6775
cl::init(3));
6876

69-
static cl::opt<bool>
70-
LatencyAware("aie-realloc-latencyaware", cl::Hidden, cl::init(true),
71-
cl::desc("Enable latency-aware allocation strategy"));
72-
73-
static cl::opt<bool>
74-
SWPAware("aie-realloc-swp-aware", cl::Hidden, cl::init(false),
75-
cl::desc("Use assignment order based on interleaved swp stages"));
77+
// Hidden command-line option "aie-reg-rewrite-mode" selecting the RewriteMode.
// It defaults to RewriteMode::Automatic; the accepted spellings are "basic",
// "auto", "latencyaware", "swpaware" and "swpaware-auto".
static cl::opt<RewriteMode> RegRewriteMode(
78+
"aie-reg-rewrite-mode", cl::Hidden, cl::init(RewriteMode::Automatic),
79+
cl::desc("Set the rewriting mode"),
80+
cl::values(clEnumValN(RewriteMode::Basic, "basic", "Basic"),
81+
clEnumValN(RewriteMode::Automatic, "auto",
82+
"Automatic selection based on loop class"),
83+
clEnumValN(RewriteMode::LatencyAware, "latencyaware",
84+
"Latency aware"),
85+
clEnumValN(RewriteMode::SWPAware, "swpaware",
86+
"SWP aware with default bias"),
87+
clEnumValN(RewriteMode::SWPAwareAutoBias, "swpaware-auto",
88+
"SWP aware with automatic bias")));
7689

7790
static cl::opt<int> MinIIBias("aie-realloc-ii-bias", cl::Hidden, cl::init(0),
78-
cl::desc("MinII bias for swp-aware"));
91+
cl::desc("Set default MinII bias for swpaware"));
7992

8093
namespace {
8194

@@ -89,6 +102,44 @@ using OriginalAllocation =
89102
// Record success for a whole register class.
90103
using RegClassSuccess = std::map<const TargetRegisterClass *, bool>;
91104

105+
/// Resolve the rewrite mode that is actually applied to a loop.
///
/// \param Mode      The requested mode. Any value other than
///                  RewriteMode::Automatic is an explicit choice and is
///                  returned unchanged.
/// \param LoopClass The class number of the loop being rewritten.
/// \returns RewriteMode::SWPAwareAutoBias for loop classes 14, 29 and 47 when
///          \p Mode is Automatic, RewriteMode::LatencyAware for every other
///          class in Automatic mode, and \p Mode itself otherwise.
RewriteMode selectMode(RewriteMode Mode, int LoopClass) {
  // An explicit request always wins over the per-class heuristic.
  const bool IsExplicit = Mode != RewriteMode::Automatic;
  if (IsExplicit)
    return Mode;

  // Loop classes for which automatic mode picks the SWP-aware ordering.
  const bool PrefersSWPAware =
      LoopClass == 14 || LoopClass == 29 || LoopClass == 47;
  return PrefersSWPAware ? RewriteMode::SWPAwareAutoBias
                         : RewriteMode::LatencyAware;
}
118+
119+
/// Decide whether the SWP-aware candidate ordering should run, and with which
/// MinII bias.
///
/// \param Mode      The already-resolved rewrite mode.
/// \param LoopClass The class number of the loop being rewritten.
/// \param[out] Bias Always written. It receives the MinIIBias option value,
///                  except in SWPAwareAutoBias mode for loop class 14 (-1) and
///                  loop classes 18 and 29 (+1).
/// \returns true for RewriteMode::SWPAware and RewriteMode::SWPAwareAutoBias,
///          false for every other mode.
bool runSWPAware(RewriteMode Mode, int LoopClass, int &Bias) {
  // Start from the command-line default; only the auto-bias mode overrides it.
  Bias = MinIIBias;

  if (Mode == RewriteMode::SWPAware)
    return true;
  if (Mode != RewriteMode::SWPAwareAutoBias)
    return false;

  // Auto-bias: per-class tuning of the MinII estimate. Classes not listed
  // here keep the default bias.
  if (LoopClass == 14)
    Bias = -1;
  else if (LoopClass == 18 || LoopClass == 29)
    Bias = 1;
  return true;
}
142+
92143
///
93144
/// This pass rewrites physical register assignments in critical parts of the
94145
/// code (like loops) to break WAW and WAR dependencies.
@@ -141,7 +192,8 @@ class AIEWawRegRewriter : public MachineFunctionPass {
141192
const std::map<const TargetRegisterClass *, bool> &RegClasses);
142193

143194
/// Sort the candidates to mimic interleaving the pipeline stages
144-
void sortSWPAware(OriginalAllocation &Candidates, MachineBasicBlock &MBB);
195+
void sortSWPAware(OriginalAllocation &Candidates, MachineBasicBlock &MBB,
196+
const llvm::AIE::SlotStatistics &Statistics, int LoopClass);
145197

146198
/// Pre-allocate all virtual registers in Candidates. The sole purpose of
147199
/// this is to prime the LRURegisters, so that the end of the loop is
@@ -383,21 +435,17 @@ RoundRobin AIEWawRegRewriter::computeLRURegisters(
383435
return LRURegisters;
384436
}
385437

386-
void AIEWawRegRewriter::sortSWPAware(OriginalAllocation &Candidates,
387-
MachineBasicBlock &MBB) {
388-
438+
void AIEWawRegRewriter::sortSWPAware(
439+
OriginalAllocation &Candidates, MachineBasicBlock &MBB,
440+
const llvm::AIE::SlotStatistics &Statistics, int Bias) {
389441
// We estimate the length of the schedule based on latencies and the
390442
// minimum II based on slots. We then estimate the modulo cycle of each
391443
// instruction based on its depth and apply LRU in the order of the modulo
392444
// cycle.
393445
// Note that both the depth and the II are underestimations since we don't
394446
// account for them interfering. Hence the modulo cycle estimate won't be
395447
// too far off.
396-
AIE::SlotStatistics Statistics = AIE::computeSlotStatistics(MBB, TII);
397-
DEBUG_DETAIL(dbgs() << "Stats="; Statistics.dumpShort(); dbgs() << "\n");
398-
DEBUG_DETAIL(dbgs() << "LoopClass=" << llvm::AIE::classifyLoop(Statistics)
399-
<< "\n");
400-
const int MinII = std::max(Statistics.getMinII() + MinIIBias, 1);
448+
const int MinII = std::max(Statistics.getMinII() + Bias, 1);
401449

402450
MachineSchedContext Context;
403451
Context.MF = MF;
@@ -456,7 +504,18 @@ bool AIEWawRegRewriter::renameMBBPhysRegs(const MachineBasicBlock *MBB) {
456504
IndexedMap<const MachineInstr *, VirtReg2IndexFunctor> LastVRegDef =
457505
getLastVRegDef(*MBB);
458506

459-
std::set<MCRegister> HighLatencyRegs = getHighOutputLatencyRegs(MBB);
507+
auto &NonConstMBB = *(const_cast<MachineBasicBlock *>(MBB));
508+
AIE::SlotStatistics Statistics = AIE::computeSlotStatistics(NonConstMBB, TII);
509+
const int LoopClass = llvm::AIE::classifyLoop(Statistics);
510+
LLVM_DEBUG(dbgs() << "Stats="; Statistics.dumpShort(); dbgs() << "\n");
511+
LLVM_DEBUG(dbgs() << "LoopClass=" << LoopClass << "\n");
512+
513+
RewriteMode Mode = selectMode(RegRewriteMode, LoopClass);
514+
515+
std::set<MCRegister> HighLatencyRegs;
516+
if (Mode == RewriteMode::LatencyAware) {
517+
HighLatencyRegs = getHighOutputLatencyRegs(MBB);
518+
}
460519

461520
OriginalAllocation Candidates;
462521

@@ -533,9 +592,9 @@ bool AIEWawRegRewriter::renameMBBPhysRegs(const MachineBasicBlock *MBB) {
533592
}
534593
}
535594

536-
if (SWPAware) {
537-
auto &NCMBB = *(const_cast<MachineBasicBlock *>(MBB));
538-
sortSWPAware(Candidates, NCMBB);
595+
int Bias = MinIIBias;
596+
if (runSWPAware(Mode, LoopClass, Bias)) {
597+
sortSWPAware(Candidates, NonConstMBB, Statistics, Bias);
539598
}
540599

541600
// Least-Recently-Used list of physical registers for assignments to VRegs.
@@ -730,10 +789,6 @@ AIEWawRegRewriter::getLastVRegDef(const MachineBasicBlock &MBB) const {
730789

731790
std::set<MCRegister> AIEWawRegRewriter::getHighOutputLatencyRegs(
732791
const MachineBasicBlock *MBB) const {
733-
734-
if (!LatencyAware)
735-
return {};
736-
737792
auto *ItinData = MF->getSubtarget().getInstrItineraryData();
738793
std::set<MCRegister> HighLatRegisters;
739794
for (const MachineInstr &MI : *MBB) {

llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,11 @@
44
; See https://llvm.org/LICENSE.txt for license information.
55
; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
66
;
7-
; (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
7+
; (c) Copyright 2024-2025 Advanced Micro Devices, Inc. or its affiliates
88

9-
; RUN: llc -O2 -mtriple=aie2 %s -o - | FileCheck %s
9+
; RUN: llc -O2 -mtriple=aie2 \
10+
; RUN: --aie-reg-rewrite-mode=latencyaware \
11+
; RUN: %s -o - | FileCheck %s
1012

1113
; The test is meant as a quick way to spot QoR regressions. In this test, the
1214
; code can only be pipelined (Pre-SWP) because of the removal of some WAW

llvm/test/CodeGen/AIE/aie2p/end-to-end/add-att-broadcasting.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
66
;
77
; (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates
8-
; RUN: llc -mtriple=aie2p --aie-realloc-latencyaware=true %s -o - | FileCheck %s
8+
; RUN: llc -mtriple=aie2p --aie-reg-rewrite-mode=latencyaware %s -o - | FileCheck %s
99

1010
; Test postSWP capabilities related to AddAttributeBroadcasting
1111

llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d-dw-bf16.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
66
;
77
; (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates
8-
; RUN: llc -mtriple=aie2p --aie-realloc-latencyaware=true %s -o - | FileCheck %s
8+
; RUN: llc -mtriple=aie2p --aie-reg-rewrite-mode=latencyaware %s -o - | FileCheck %s
99

1010
; Test postSWP capabilities related to conv2d_dw_bf16.
1111

llvm/test/CodeGen/AIE/aie2p/end-to-end/gemm-bfp16.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
66
;
77
; (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates
8-
; RUN: llc -mtriple=aie2p --aie-realloc-latencyaware=true %s -o - | FileCheck %s
8+
; RUN: llc -mtriple=aie2p --aie-reg-rewrite-mode=latencyaware %s -o - | FileCheck %s
99

1010
; Test postSWP capabilities related to gemm_bfp16.
1111

llvm/test/CodeGen/AIE/aie2p/ra/waw_reg_renaming.mir

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,13 @@
77
#
88
# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates
99

10-
# Basic test for the WAW register renaming pass. Check AIE2 tests for more coverage.
11-
# Additional pass options are disabled because we are testing the basic mechanism.
10+
# Basic test for the WAW register renaming pass. Check AIE2 tests for more
11+
# coverage. Additional pass options are disabled because we are testing the
12+
# basic mechanism.
1213

13-
# RUN: llc -mtriple=aie2p -verify-machineinstrs --aie-realloc-latencyaware=false --start-before=greedy \
14-
# RUN: --stop-after=virtregrewriter %s -o - | FileCheck %s
14+
# RUN: llc -mtriple=aie2p -verify-machineinstrs --aie-reg-rewrite-mode=basic \
15+
# RUN: --start-before=greedy --stop-after=virtregrewriter \
16+
# RUN: %s -o - | FileCheck %s
1517

1618
# Make sure VLD and VMAX define different X registers.
1719
---

llvm/test/CodeGen/AIE/aie2p/ra/waw_reg_renaming_latencyaware.mir

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,10 @@
1313
# try to not allocate it as def of any other instruction. In the example below, no
1414
# other instructions should output to the same registers used by the loads.
1515

16-
# RUN: llc -mtriple=aie2p -verify-machineinstrs --aie-realloc-latencyaware=true\
17-
# RUN: --start-before=greedy --stop-after=virtregrewriter %s -o - | FileCheck %s
16+
# RUN: llc -mtriple=aie2p -verify-machineinstrs \
17+
# RUN: --aie-reg-rewrite-mode=latencyaware \
18+
# RUN: --start-before=greedy --stop-after=virtregrewriter \
19+
# RUN: %s -o - | FileCheck %s
1820

1921
---
2022
name: conv2d_dw_bf16

0 commit comments

Comments
 (0)