Commit 20a1637

Merge branch 'main' into chengjun/trans_2d_load

2 parents dd08a34 + 3c154c8

114 files changed: +4578, -2083 lines

.github/README.md

Lines changed: 1 addition & 0 deletions

@@ -14,6 +14,7 @@ Intel® XPU Backend for Triton\* is a out of tree backend module for [Triton](ht
 * [Intel® Data Center GPU Max Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/max-series.html)
 * [Intel® Data Center Flex Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/flex-series.html)
 * [Intel® Arc A770](https://www.intel.com/content/www/us/en/products/sku/229151/intel-arc-a770-graphics-16gb/specifications.html)
+* [Intel® Arc B580](https://www.intel.com/content/www/us/en/products/sku/241598/intel-arc-b580-graphics/specifications.html)
 * GPU Drivers:
   * Latest [Long Term Support (LTS) Release](https://dgpu-docs.intel.com/driver/installation.html)
   * Latest [Rolling Release](https://dgpu-docs.intel.com/driver/installation-rolling.html)

.github/workflows/integration-tests-amd.yml

Lines changed: 24 additions & 8 deletions

@@ -18,8 +18,25 @@ jobs:
         runner: ${{ fromJson(inputs.matrix) }}
         include:
           - image: rocm/pytorch:rocm6.2.2_ubuntu22.04_py3.10_pytorch_2.5.1_asan
+            runner: ["self-hosted", "gfx90a"]
+            # Cache save/restore is on the host machine at directory /home/runner/.triton, while in the docker
+            # container expect it at /github/home/.triton. So map here to make sure visible in docker.
+            options: >-
+              --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
+              --volume /home/runner/.triton:/github/home/.triton
+          - image: rocm/pytorch:rocm6.2.2_ubuntu22.04_py3.10_pytorch_2.5.1_asan
+            runner: ["amd-gfx942"]
+            # We add --env-file to pull in HIP_VISIBLE_DEVICES and ROCR_VISIBLE_DEVICES definition for GPU isolation.
+            options: >-
+              --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
+              --env-file /etc/podinfo/gha-gpu-isolation-settings
+              --volume /home/runner/.triton:/github/home/.triton
           - image: rocm/7.0-preview:rocm7.0_preview_ubuntu22.04_llama2_70b_training_mlperf_mi35X_prealpha
             runner: ["amd-gfx950"]
+            options: >-
+              --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
+              --env-file /etc/podinfo/gha-gpu-isolation-settings
+              --volume /home/runner/.triton:/github/home/.triton
     env:
       RUNNER_TYPE: ${{ matrix.runner[1] }}
       TRITON_BUILD_WITH_CCACHE: "true"

@@ -31,11 +48,7 @@ jobs:
       CCACHE_COMPRESS: "true"
     container:
       image: ${{ matrix.image }}
-      # Cache save/restore is on the host machine at directory /home/runner/.triton, while in the docker
-      # container expect it at /github/home/.triton. So map here to make sure visible in docker.
-      options: >-
-        --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
-        --volume /home/runner/.triton:/github/home/.triton
+      options: ${{ matrix.options }}
     steps:
       - name: Checkout
         uses: actions/checkout@v4

@@ -96,6 +109,8 @@ jobs:
         run: ccache --print-stats
       - name: Run lit tests
         run: make test-lit
+      - name: Run C++ unittests
+        run: make test-cpp
       - name: Run python tests on AMD
         run: |
           INSTRUMENTATION_LIB_DIR="${GITHUB_WORKSPACE}/python/triton/instrumentation"

@@ -147,13 +162,13 @@
           python3 -m pytest -s -n 8 ./test_cast_matmul.py
       - name: Run Proton tests
         run: |
+          unset HIP_VISIBLE_DEVICES
+          unset ROCR_VISIBLE_DEVICES
           if [ "${{ matrix.runner[0] }}" = "amd-gfx950" ]; then
             python3 -m pytest -s -n 8 third_party/proton/test -k "not test_instrument_exec"
           else
             make test-proton
           fi
-      - name: Run C++ unittests
-        run: make test-cpp
       - name: Inspect cache directories
         run: |
           mkdir -p ~/.triton

@@ -162,7 +177,8 @@
           mkdir -p ~/.ccache
           du -h -d 1 ~/.ccache
       - name: Clean up caches
-        # Always cleanup the worker, even if builds or tests failed
+        # Always cleanup the worker, even if builds or tests failed given that these directories are
+        # mapped from the host and we write files as the root user in the docker.
         if: always()
         run: |
           rm -rf ~/.triton/cache

Makefile

Lines changed: 1 addition & 1 deletion

@@ -6,7 +6,7 @@ PYTHON ?= python
 BUILD_DIR := $(shell cd python; $(PYTHON) -c 'from build_helpers import get_cmake_dir; print(get_cmake_dir())')
 TRITON_OPT := $(BUILD_DIR)/bin/triton-opt
 PYTEST := $(PYTHON) -m pytest
-LLVM_BUILD_PATH ?= $(realpath .llvm-project/build)
+LLVM_BUILD_PATH ?= "$(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))/.llvm-project/build"
 NUM_PROCS ?= 8

 # Incremental builds

include/triton/Analysis/Utility.h

Lines changed: 0 additions & 7 deletions

@@ -252,13 +252,6 @@ bool cvtNeedsWarpShuffle(RankedTensorType srcTy, RankedTensorType dstTy);
 // warps, and possibly blocks.
 bool cvtNeedsSharedMemory(RankedTensorType srcTy, RankedTensorType dstTy);

-bool atomicNeedsSharedMemory(Value result);
-
-// Check if MFMA layout can be converted to the dot operand
-// layout using warp shuffle.
-bool matchMFMAAndDotOperandShuffleCase(RankedTensorType srcTy,
-                                       RankedTensorType dstTy);
-
 // TODO: Move utility functions that belong to ConvertLayoutOp to class
 // ConvertLayoutOpHelper in the future
 bool shouldUseDistSmem(Attribute srcLayout, Attribute dstLayout);

include/triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h

Lines changed: 1 addition & 0 deletions

@@ -97,6 +97,7 @@ class TargetInfoBase {
   virtual bool supportLdMatrix() const { return false; }
   virtual bool supportStMatrix() const { return false; }
   virtual bool isCuda() const { return false; }
+  virtual bool isXpu() const { return false; }

   // Annotate target specific information to local load operations during
   // lowering to LLVM. `llLoadOp` is the generated LLVM load op.
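The new isXpu() hook mirrors the existing isCuda() query: a virtual predicate that defaults to false and is overridden by a backend's TargetInfo subclass. A minimal sketch of how lowering code could branch on it; the surrounding variable and the XPU-specific branch are illustrative assumptions, not part of this commit:

  // Sketch only: targetInfo is the const TargetInfoBase & already threaded
  // through the TritonGPUToLLVM lowering helpers.
  if (targetInfo.isXpu()) {
    // Hypothetical XPU-specific lowering path.
  } else if (targetInfo.isCuda()) {
    // Existing CUDA-specific path.
  }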

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 25 additions & 11 deletions

@@ -557,13 +557,14 @@ Value emitPadding(Location loc, RewriterBase &rewriter,
 // calcPaddedOffset is a lambda that takes a base offset (mlir::Value)
 // and computes a new offset (mlir::Value) by applying padding based on
 // shared memory layout.
-SmallVector<Value> lowerLdStShared(
-    Location loc, MLIRContext *ctx, LinearLayout cvt,
-    ArrayRef<Value> valsArray, // Input for store, output for load
-    Type llvmElemTy, Value smemBase,
-    std::function<Value(Value)> calcPaddedOffset, Value affineOffset,
-    uint64_t maskSpanAffineOffset, ConversionPatternRewriter &rewriter,
-    const TargetInfoBase &targetInfo, Operation *localLoadOp = nullptr);
+SmallVector<Value>
+lowerLdStShared(Location loc, MLIRContext *ctx, LinearLayout cvt,
+                ArrayRef<Value> valsArray, // Input for store, output for load
+                Type llvmElemTy, Value smemBase,
+                std::function<Value(Value)> calcPaddedOffset,
+                Value affineOffset, uint64_t maskSpanAffineOffset,
+                RewriterBase &rewriter, const TargetInfoBase &targetInfo,
+                Operation *localLoadOp = nullptr);

 // Lower an ld/st-like operation given a layout and a callback that creates the
 // PTX instruction Lowers to st when valArrays is empty, and to ld when it is

@@ -576,10 +577,10 @@ SmallVector<Value> lowerLdSt(
     ArrayRef<Value> valsArray, // Input for store, output for load
     Type llvmElemTy, Value smemBase,
     std::function<Value(Value)> calcPaddedOffset, Value affineOffset,
-    uint64_t maskSpanAffineOffset, ConversionPatternRewriter &rewriter,
+    uint64_t maskSpanAffineOffset, RewriterBase &rewriter,
     const TargetInfoBase &targetInfo, std::optional<int> maybeMaxVecElems,
-    std::function<SmallVector<Value>(ConversionPatternRewriter &, Location,
-                                     ArrayRef<Value>, Value, int, VectorType)>
+    std::function<SmallVector<Value>(RewriterBase &, Location, ArrayRef<Value>,
+                                     Value, int, VectorType)>
        lowerInst);

 // Lower local_load/local_store via ld.shared/st.shared

@@ -588,7 +589,7 @@ lowerLocalLdSt(Location loc, MLIRContext *ctx,
                LinearLayout cvt,          // Map from registers to offset
                ArrayRef<Value> valsArray, // Input for store, empty for load
                Type llvmElemTy, triton::gpu::MemDescType srcTy,
-               SharedMemoryObject smemObj, ConversionPatternRewriter &rewriter,
+               SharedMemoryObject smemObj, RewriterBase &rewriter,
                const TargetInfoBase &targetInfo,
                Operation *localLoadOp = nullptr);

@@ -643,6 +644,12 @@ Value transferWithinBlockPadding(triton::gpu::ConvertLayoutOp op, Value src,
                                  const LLVMTypeConverter *typeConverter,
                                  RewriterBase &rewriter);

+LogicalResult
+transferWithinBlockSwizzling(triton::gpu::ConvertLayoutOp op, Value src,
+                             const TargetInfoBase &targetInfo,
+                             const LLVMTypeConverter *typeConverter,
+                             RewriterBase &rewriter);
+
 SmallVector<Value> inlineRegionImpl(RewriterBase &rewriter, Region &region,
                                     ArrayRef<Value> args,
                                     mlir::TypeID terminatorTypeId,

@@ -655,6 +662,13 @@ SmallVector<Value> inlineRegion(RewriterBase &rewriter, Region &region,
                                 mlir::TypeID::get<TerminatorOp>(), loc);
 }

+void finalizeTensorAtomicResults(Operation *op, RankedTensorType tensorTy,
+                                 ConversionPatternRewriter &rewriter,
+                                 SmallVector<Value> &resultVals,
+                                 Type valueElemTy, TritonLLVMOpBuilder &b,
+                                 Value threadPred,
+                                 const TargetInfoBase &targetInfo,
+                                 const LLVMTypeConverter *typeConverter);
 } // namespace mlir

 #endif
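The signature changes above relax the rewriter parameter of these shared-memory helpers from ConversionPatternRewriter to the more general RewriterBase, so they can also be invoked outside dialect-conversion patterns. A minimal sketch of a lowerInst callback matching the new std::function parameter of lowerLdSt; only the parameter list is taken from the declaration above, the body and names are placeholder assumptions:

  // Sketch only: a pass-through callback with the shape lowerLdSt now expects
  // (RewriterBase instead of ConversionPatternRewriter).
  auto lowerInst = [](RewriterBase &rewriter, Location loc,
                      ArrayRef<Value> vals, Value shmemAddr, int startIdx,
                      VectorType vecTy) -> SmallVector<Value> {
    // A real callback would emit the target's ld/st instruction here and
    // return the loaded values (or an empty vector for a store).
    return SmallVector<Value>(vals.begin(), vals.end());
  };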

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 1 addition & 1 deletion

@@ -1275,7 +1275,7 @@ def ReturnOp : TT_Op<"return", [Pure, HasParent<"FuncOp">, /*MemRefsNormalizable
   let arguments = (ins Variadic<AnyType>:$srcs);

   let builders = [OpBuilder<(ins), [{
-    build($_builder, $_state, std::nullopt);
+    build($_builder, $_state, mlir::ValueRange());
   }]>];

   let assemblyFormat = "attr-dict ($srcs^ `:` type($srcs))?";
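For context, the zero-operand builder now passes an explicit empty mlir::ValueRange instead of std::nullopt, sidestepping the implicit std::nullopt conversion that newer LLVM/MLIR releases deprecate. A minimal usage sketch of building a tt.return with no operands; the builder and location variables are assumptions:

  // Sketch only: both forms resolve to generated ReturnOp builders.
  builder.create<mlir::triton::ReturnOp>(loc);                     // zero-operand builder patched above
  builder.create<mlir::triton::ReturnOp>(loc, mlir::ValueRange()); // explicit empty $srcs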

include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td

Lines changed: 0 additions & 2 deletions

@@ -214,8 +214,6 @@ def TTG_MemDescIndexOp : TTG_Op<"memdesc_index", [Pure, MemDescViewTrait]> {
     - the output shape is 4x16xf16, and
     - index = 1.
     Then the output descriptor is equivalent to input[1], where input is the logical tensor.
-
-    When the input is of rank 1 (i.e, shape=[k]), the output will have shape=[1].
   }];

   let arguments = (ins TTG_MemDescType:$src, I32:$index);

include/triton/Dialect/TritonInstrument/IR/CMakeLists.txt

Lines changed: 2 additions & 0 deletions

@@ -8,6 +8,8 @@ add_mlir_doc(TritonInstrumentDialect TritonInstrumentDialect dialects/ -gen-dial
 set(LLVM_TARGET_DEFINITIONS TritonInstrumentOps.td)
 mlir_tablegen(Ops.h.inc -gen-op-decls)
 mlir_tablegen(Ops.cpp.inc -gen-op-defs)
+mlir_tablegen(OpsEnums.h.inc -gen-enum-decls)
+mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs)
 add_mlir_doc(TritonInstrumentOps TritonInstrumentOps dialects/ -gen-op-doc)

 add_public_tablegen_target(TritonInstrumentTableGen)
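The two new mlir_tablegen lines generate enum declarations and definitions from the same TritonInstrumentOps.td. In the usual MLIR layout, the generated .inc files are then included from the dialect's C++ header and source, roughly as sketched below; the exact include locations are an assumption and not shown in this commit:

  // In the dialect's ops header (sketch):
  #include "triton/Dialect/TritonInstrument/IR/OpsEnums.h.inc"

  // In the corresponding .cpp file, for the out-of-line enum definitions (sketch):
  #include "triton/Dialect/TritonInstrument/IR/OpsEnums.cpp.inc"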
