
Commit 8382e76

Merge commit '6958807390a43dca6f68cf9039aed9f4c72c700d'
2 parents: 7d4f1ce + 6958807

27 files changed: +764 additions, −185 deletions


.github/workflows/llvm-build.yml

Lines changed: 25 additions & 0 deletions
@@ -28,6 +28,7 @@ jobs:
         config:
         - {runner: 'Ubuntu 20.04', runs_on: 'ubuntu-20.04', target-os: 'ubuntu', arch: 'x64'}
         - {runner: 'Ubuntu 20.04 ARM64', runs_on: 'ubuntu-20.04', target-os: 'ubuntu', arch: 'arm64'}
+        - {runner: 'CentOS 7', runs_on: ['self-hosted', 'CPU'], target-os: 'centos', arch: 'x64'}
         - {runner: 'AlmaLinux 8', runs_on: ['self-hosted', 'CPU'], target-os: 'almalinux', arch: 'x64'}
         - {runner: 'MacOS X64', runs_on: 'macos-12', target-os: 'macos', arch: 'x64'}
         - {runner: 'MacOS ARM64', runs_on: 'macos-12', target-os: 'macos', arch: 'arm64'}
@@ -233,6 +234,30 @@ jobs:
 
           tar czf "${{ env.llvm_install_dir }}.tar.gz" "${{ env.llvm_install_dir }}"
 
+
+      - name: Configure, Build, Test, and Install LLVM (CentOS)
+        if: matrix.config.target-os == 'centos'
+        run: |
+          # if this step crashes, it can leave behind a stale docker container
+          docker container prune -f
+          docker rmi -f $(docker images -q)
+
+          docker build --tag llvm-build --build-arg llvm_dir=llvm-project \
+            -f llvm-build/.github/workflows/llvm-build/centos.Dockerfile .
+
+          # Create temporary container to copy cache and installed artifacts.
+          CONTAINER_ID=$(docker create llvm-build)
+          docker cp "${CONTAINER_ID}:/install" "${{ env.llvm_install_dir }}"
+          tar czf "${{ env.llvm_install_dir }}.tar.gz" "${{ env.llvm_install_dir }}"
+
+          # We remove the existing directory, otherwise docker will
+          # create a subdirectory inside the existing directory.
+          rm -rf "${{ env.SCCACHE_DIR }}"
+          docker cp "${CONTAINER_ID}:/sccache" "${{ env.SCCACHE_DIR }}"
+          sudo chown -R "$(id -u -n):$(id -g -n)" "${{ env.SCCACHE_DIR }}"
+
+          docker rm "${CONTAINER_ID}"
+
       - name: Configure, Build, Test, and Install LLVM (AlmaLinux)
         if: matrix.config.target-os == 'almalinux'
         run: |
.github/workflows/llvm-build/centos.Dockerfile

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
+FROM centos:7
+ARG llvm_dir=llvm-project
+# Add the cache artifacts and the LLVM source tree to the container
+ADD sccache /sccache
+ADD "${llvm_dir}" /source/llvm-project
+ENV SCCACHE_DIR="/sccache"
+ENV SCCACHE_CACHE_SIZE="2G"
+
+RUN echo -e "[llvmtoolset-build]\nname=LLVM Toolset 13.0 - Build\nbaseurl=https://buildlogs.centos.org/c7-llvm-toolset-13.0.x86_64/\ngpgcheck=0\nenabled=1" > /etc/yum.repos.d/llvmtoolset-build.repo
+
+# Note: this patch is required since CentOS 7 has reached EOL;
+# otherwise any yum install step will fail
+RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
+
+# Install build dependencies
+RUN yum install --assumeyes centos-release-scl
+
+# The definition of insanity is doing the same thing and expecting a different result
+RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
+
+RUN yum install --assumeyes --nogpgcheck llvm-toolset-13.0
+RUN yum install --assumeyes rh-python38-python-devel rh-python38-python-pip
+SHELL [ "/usr/bin/scl", "enable", "llvm-toolset-13.0", "rh-python38" ]
+
+RUN python3 -m pip install --upgrade pip
+RUN python3 -m pip install --upgrade cmake ninja sccache
+
+# Install MLIR's Python Dependencies
+RUN python3 -m pip install -r /source/llvm-project/mlir/python/requirements.txt
+
+# Configure, Build, Test, and Install LLVM
+RUN cmake -GNinja -Bbuild \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DCMAKE_C_COMPILER=clang \
+  -DCMAKE_CXX_COMPILER=clang++ \
+  -DCMAKE_ASM_COMPILER=clang \
+  -DCMAKE_C_COMPILER_LAUNCHER=sccache \
+  -DCMAKE_CXX_COMPILER_LAUNCHER=sccache \
+  -DCMAKE_CXX_FLAGS="-Wno-everything" \
+  -DCMAKE_LINKER=lld \
+  -DCMAKE_INSTALL_PREFIX="/install" \
+  -DLLVM_BUILD_UTILS=ON \
+  -DLLVM_BUILD_TOOLS=ON \
+  -DLLVM_ENABLE_ASSERTIONS=ON \
+  -DMLIR_ENABLE_BINDINGS_PYTHON=ON \
+  -DLLVM_ENABLE_PROJECTS=mlir \
+  -DLLVM_ENABLE_TERMINFO=OFF \
+  -DLLVM_INSTALL_UTILS=ON \
+  -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU" \
+  /source/llvm-project/llvm
+
+RUN ninja -C build install
include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 1 addition & 0 deletions
@@ -211,6 +211,7 @@ def TT_AddPtrOp : TT_Op<"addptr",
   let results = (outs TT_PtrLike:$result);
 
   let assemblyFormat = "$ptr `,` $offset attr-dict `:` type($result) `,` type($offset)";
+  let hasFolder = 1;
 }
 
 def TT_AdvanceOp : TT_Op<"advance",
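
Setting hasFolder makes ODS declare a fold hook on the generated C++ op class; the commit supplies its body in Ops.cpp below. A minimal sketch of the generated declaration, assuming standard MLIR ODS conventions (the exact code lives in the generated TritonOps.h.inc):

// Sketch only: roughly what `let hasFolder = 1;` adds to the generated
// AddPtrOp class for a single-result op (standard MLIR ODS convention).
class AddPtrOp /* : generated op base */ {
public:
  // Declared by ODS; implemented by hand in lib/Dialect/Triton/IR/Ops.cpp.
  ::mlir::OpFoldResult fold(FoldAdaptor adaptor);
};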

include/triton/Dialect/TritonGPU/Transforms/Utility.h

Lines changed: 7 additions & 1 deletion
@@ -195,7 +195,13 @@ int getNVIDIAComputeCapability(Operation *module);
 std::optional<mlir::triton::gpu::SharedEncodingAttr>
 getSharedEncIfAllUsersAreDotEnc(Value val, bool &incompatible);
 
-bool loadIsMMAv3(Operation *loadOp);
+enum class MMALoadType {
+  SharedV3,
+  Registers,     // may be v2 or v3
+  DoNotPipeline, // could be a valid shared/registers MMA operand, but skip
+                 // pipelining
+};
+MMALoadType getMMALoadType(Operation *loadOp);
 } // namespace mlir
 
 #endif // TRITON_DIALECT_TRITONGPU_TRANSFORMS_UTILITY_H_
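
Replacing the boolean loadIsMMAv3 with a tri-state enum lets callers separate "pipeline through shared memory" from "operand stays in registers" and from "skip pipelining entirely". A hypothetical caller might branch like this (illustration only, not code from this commit; the real consumer is LoopScheduling.cpp further down):

// Hypothetical consumer of the new tri-state result (assumption; this
// commit's actual use is shown in LoopScheduling.cpp below).
MMALoadType kind = getMMALoadType(loadOp);
switch (kind) {
case MMALoadType::SharedV3:
  // MMAv3 operand fed through shared memory: give it a shared encoding.
  break;
case MMALoadType::Registers:
  // Operand kept in registers; the consuming dot may be MMAv2 or MMAv3.
  break;
case MMALoadType::DoNotPipeline:
  // Possibly a valid MMA operand, but pipelining should be skipped.
  break;
}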

lib/Dialect/Triton/IR/Ops.cpp

Lines changed: 9 additions & 0 deletions
@@ -848,6 +848,15 @@ void MakeTensorPtrOp::build(OpBuilder &builder, OperationState &state,
                             builder.getDenseI32ArrayAttr(order));
 }
 
+//-- AddPtrOp --
+OpFoldResult AddPtrOp::fold(FoldAdaptor adaptor) {
+  // addptr(ptr, 0) -> ptr
+  if (matchPattern(adaptor.getOffset(), m_Zero())) {
+    return getPtr();
+  }
+  return {};
+}
+
 //-- AdvanceOp --
 OpFoldResult AdvanceOp::fold(FoldAdaptor adaptor) {
   // advance(ptr, 0, 0) -> ptr
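
Because the FoldAdaptor hands the folder the offset as a constant attribute, and MLIR's m_Zero matcher accepts a scalar integer zero as well as a splat of zeros, the fold also covers addptr(ptr, splat(0)) without a separate rewrite pattern. A minimal standalone sketch of that matcher behavior (an assumption based on mlir/IR/Matchers.h; offsetIsZero is a hypothetical helper, not part of the commit):

// Sketch (assumption based on mlir/IR/Matchers.h): m_Zero() matches an
// IntegerAttr 0 and a splat DenseElementsAttr of zeros alike, so a null
// check plus one matchPattern call handles both scalar and tensor offsets.
static bool offsetIsZero(mlir::Attribute offset) {
  return offset && mlir::matchPattern(offset, mlir::m_Zero());
}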

lib/Dialect/Triton/Transforms/Combine.cpp

Lines changed: 1 addition & 31 deletions
@@ -7,7 +7,6 @@
 #include "mlir/Support/LLVM.h"
 #include "mlir/Support/LogicalResult.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-#include "triton/Analysis/Utility.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
 #include "triton/Dialect/Triton/Transforms/Passes.h"
 
@@ -18,35 +17,7 @@ namespace mlir::triton {
 namespace {
 
 bool isZero(Value val) {
-  if (matchPattern(val, m_Zero()) || matchPattern(val, m_AnyZeroFloat()))
-    return true;
-  // broadcast(constant_0)
-  if (auto bc = val.getDefiningOp<BroadcastOp>()) {
-    if (matchPattern(bc.getSrc(), m_Zero()) ||
-        matchPattern(bc.getSrc(), m_AnyZeroFloat()))
-      return true;
-  }
-  return false;
-}
-
-bool isBroadcastConstantCombinable(Attribute value) {
-  if (auto denseValue = dyn_cast<DenseElementsAttr>(value)) {
-    return denseValue.isSplat();
-  }
-  return isa<FloatAttr, IntegerAttr>(value);
-}
-
-DenseElementsAttr getConstantValue(Builder &builder, Attribute value,
-                                   Value bcast_res) {
-  auto resType = cast<ShapedType>(bcast_res.getType());
-  DenseElementsAttr res;
-  if (auto denseValue = dyn_cast<DenseElementsAttr>(value)) {
-    res =
-        DenseElementsAttr::get(resType, denseValue.getSplatValue<Attribute>());
-  } else {
-    res = DenseElementsAttr::get(resType, value);
-  }
-  return res;
+  return (matchPattern(val, m_Zero()) || matchPattern(val, m_AnyZeroFloat()));
 }
 
 bool isAddPtrOffsetCombinable(Value first, Value second) {
@@ -231,7 +202,6 @@ class CombineOpsPass : public TritonCombineOpsBase<CombineOpsPass> {
     // %}
     patterns.add<CombineSelectMaskedLoadPattern>(context);
     patterns.add<CombineAddPtrPattern>(context);
-    patterns.add<CombineBroadcastConstantPattern>(context);
     patterns.add<CombineBroadcastMulReducePattern>(context);
 
     if (applyPatternsAndFoldGreedily(m, std::move(patterns)).failed())

lib/Dialect/Triton/Transforms/Combine.td

Lines changed: 0 additions & 7 deletions
@@ -44,11 +44,4 @@ def CombineAddPtrPattern : Pat<
   (TT_AddPtrOp $ptr, (Arith_AddIOp $idx0, $idx1, DefOverflow)),
   [(Constraint<CPred<"isAddPtrOffsetCombinable($0, $1)">> $idx0, $idx1)]>;
 
-// broadcast(cst) => cst
-def getConstantValue : NativeCodeCall<"getConstantValue($_builder, $0, $1)">;
-def CombineBroadcastConstantPattern : Pat<
-  (TT_BroadcastOp:$bcast_res (Arith_ConstantOp $value)),
-  (Arith_ConstantOp (getConstantValue $value, $bcast_res), (location $bcast_res)),
-  [(Constraint<CPred<"isBroadcastConstantCombinable($0)">> $value)]>;
-
 #endif

lib/Dialect/Triton/Transforms/ReorderBroadcast.cpp

Lines changed: 2 additions & 14 deletions
@@ -206,18 +206,6 @@ struct MoveBroadcastAfterElementwisePattern
   }
 };
 
-template <typename OpType>
-class CanonicalizePattern : public OpRewritePattern<OpType> {
-public:
-  explicit CanonicalizePattern(MLIRContext *context)
-      : OpRewritePattern<OpType>(context) {}
-
-  LogicalResult matchAndRewrite(OpType op,
-                                PatternRewriter &rewriter) const override {
-    return OpType::canonicalize(op, rewriter);
-  }
-};
-
 class ReorderBroadcastPass
     : public ::impl::TritonReorderBroadcastBase<ReorderBroadcastPass> {
 public:
@@ -226,8 +214,8 @@ class ReorderBroadcastPass
     RewritePatternSet patterns(context);
     ModuleOp m = getOperation();
 
-    patterns.add<CanonicalizePattern<BroadcastOp>>(context);
-    patterns.add<CanonicalizePattern<ExpandDimsOp>>(context);
+    BroadcastOp::getCanonicalizationPatterns(patterns, context);
+    ExpandDimsOp::getCanonicalizationPatterns(patterns, context);
     // elementwise(broadcast(a)) => broadcast(elementwise(a))
     patterns.add<MoveBroadcastAfterElementwisePattern>(context);
     // elementwise(splat(a), splat(b), ...) => splat(elementwise(a, b, ...))
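
The deleted CanonicalizePattern wrapper existed only to adapt a static canonicalize method into a RewritePattern; ops that declare a canonicalizer in ODS already expose a generated entry point, which the pass now calls directly. A sketch of that hook's shape, assuming standard MLIR ODS conventions (the declaration itself is not part of this diff):

// Sketch: the ODS-generated hook used above in place of the hand-rolled
// wrapper (emitted for ops declaring a canonicalizer in TableGen).
class BroadcastOp /* : generated op base */ {
public:
  static void getCanonicalizationPatterns(::mlir::RewritePatternSet &results,
                                          ::mlir::MLIRContext *context);
};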

lib/Dialect/TritonGPU/Transforms/LoopScheduling.cpp

Lines changed: 9 additions & 2 deletions
@@ -145,11 +145,18 @@ filterPipelinedLoad(llvm::SmallVector<std::tuple<Operation *, int, Operation *>>
 
     bool hasSharedEncoding = false;
     if (use->hasTrait<OpTrait::DotLike>()) {
-      if (loadIsMMAv3(op)) {
+      auto mmaLoadType = getMMALoadType(op);
+      auto dot = dyn_cast<tt::DotOp>(use);
+      auto warpGroupDot = dyn_cast<ttng::WarpGroupDotOp>(use);
+      bool isMMAv3Shared = mmaLoadType == MMALoadType::SharedV3;
+      bool isMMAv3Registers =
+          (mmaLoadType == MMALoadType::Registers) && warpGroupDot;
+
+      if (isMMAv3Shared) {
         hasSharedEncoding = true;
       } else if (isa<tt::ExperimentalDescriptorLoadOp>(op)) {
         hasSharedEncoding = true;
-      } else if (auto dot = dyn_cast<tt::DotOp>(use)) {
+      } else if (isMMAv3Registers || dot) {
         // FIXME: if we have a better solution in handling incompatible shared
         // encoding, we can simplify the logic here by checking if all users are
         // dot encoding. For now, getSharedEncIfAllUsersAreDotEnc will be used
