Skip to content
Draft
18 changes: 11 additions & 7 deletions .github/workflows/integration-tests-amd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ jobs:
options: >-
--device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
--volume /home/runner/.triton:/github/home/.triton
- image: rocm/pytorch:rocm6.2.2_ubuntu22.04_py3.10_pytorch_2.5.1_asan
- image: pmylonamd/rocm7.0_ubuntu22.04_py3.10_pytorch_2.8.0_asan
runner: ["amd-gfx942"]
# We add --env-file to pull in the HIP_VISIBLE_DEVICES and ROCR_VISIBLE_DEVICES definitions for GPU isolation.
options: >-
Expand All @@ -44,6 +44,7 @@ jobs:
PROTON_SKIP_PC_SAMPLING_TEST: 1
PYTHON: "python3"
CCACHE_COMPRESS: "true"
PIP_BREAK_SYSTEM_PACKAGES: 1
container:
image: ${{ matrix.image }}
options: ${{ matrix.options }}
Expand Down Expand Up @@ -157,18 +158,21 @@ jobs:
run: |
make test-distributed
- name: Run asan tests on AMD
if: false
if: ${{ matrix.runner[0] == 'amd-gfx942' }}
run: |
cd third_party/amd/python/test/
ulimit -s 1024
export PATH=$(find ~/.triton/llvm -name llvm-symbolizer -printf '%h\n'):$PATH
export LD_LIBRARY_PATH=$(find /opt -name libclang_rt.asan-x86_64.so -printf '%h\n'):$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$(find /opt -name libclang_rt.asan-x86_64.so -printf '%h\n' | head -n1):$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$(find /opt -type d -wholename *lib/llvm/lib/asan):$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$(find /usr -name libcaffe2_nvrtc.so -printf '%h\n'):$LD_LIBRARY_PATH
export CLANG_ASAN_LIB=$(find /opt -name libclang_rt.asan-x86_64.so)
TORCH_PATH=$(find /opt -name libcaffe2_nvrtc.so -printf '%h\n')
mv $TORCH_PATH/libamdhip64.so $TORCH_PATH/libamdhip64_bck.so
export LD_LIBRARY_PATH=$TORCH_PATH:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$(find /opt -wholename *lib/asan/libamdhip64.so -printf '%h\n'):$LD_LIBRARY_PATH
export CLANG_ASAN_LIB=$(find /opt -name libclang_rt.asan-x86_64.so | head -n1)
export HIP_ASAN_LIB=$(find /opt -wholename *lib/asan/libamdhip64.so)
ASAN_OPTIONS=detect_leaks=0,alloc_dealloc_mismatch=0 \
LD_PRELOAD=$CLANG_ASAN_LIB:$HIP_ASAN_LIB python3 -m pytest -s test_address_sanitizer.py
ASAN_OPTIONS=detect_leaks=0,alloc_dealloc_mismatch=0 LD_PRELOAD=$CLANG_ASAN_LIB:$HIP_ASAN_LIB python3 -m pytest -s test_address_sanitizer.py
mv $TORCH_PATH/libamdhip64_bck.so $TORCH_PATH/libamdhip64.so
- name: Run regression tests
run: |
make test-regression
Expand Down
1 change: 0 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ test-unit: all

.PHONY: test-distributed
test-distributed: all
$(PYTHON) -m pip install --upgrade pip
$(PYTHON) -m pip install python/triton_kernels -v
$(PYTEST) -s python/triton_kernels/bench/distributed.py

Expand Down
22 changes: 20 additions & 2 deletions python/test/unit/language/test_tensor_descriptor.py
Original file line number Diff line number Diff line change
Expand Up @@ -1190,8 +1190,21 @@ def trunc_to_tf32(tensor):
tf32_simulated = masked_int.view(np.float32)
return tf32_simulated

torch.manual_seed(42)
if dtype_str == "float32":
dtype = torch.float32
elif dtype_str == "float16":
dtype = torch.float16
else: # bfloat16
dtype = torch.bfloat16

# Test a layout where block_M and block_N are split into two separate chunks.
A = numpy_random((M, K), dtype_str)
A = torch.rand((M, K), dtype=dtype)
if dtype == torch.bfloat16:
A = A.float().numpy()
else:
A = A.numpy()

if dtype_str == "float32":
A = trunc_to_tf32(A)

Expand All @@ -1204,7 +1217,12 @@ def chunk(X, BLOCK0, BLOCK1):
A = to_triton(A, device=device, dst_type=dtype_str)
A_reshaped = to_triton(A_reshaped, device=device, dst_type=dtype_str)

B = numpy_random((N, K), dtype_str)
B = torch.rand((N, K), dtype=dtype)
if dtype == torch.bfloat16:
B = B.float().numpy()
else:
B = B.numpy()

if dtype_str == "float32":
B = trunc_to_tf32(B)

Expand Down
13 changes: 10 additions & 3 deletions third_party/amd/python/test/address_sanitizer_helper.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import torch
import triton
import triton.language as tl
import argparse

size = 4096
x = torch.rand(size, device='cuda')
Expand Down Expand Up @@ -28,6 +29,12 @@ def add_kernel(
tl.store(output_ptr + offsets, output)


pgm = add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
amdgcn = pgm.asm['amdgcn']
print(amdgcn)
parser = argparse.ArgumentParser()
parser.add_argument("mode", choices=["warmup", "launch"])
args = parser.parse_args()
if args.mode == "warmup":
add_kernel.warmup(x, y, output, n_elements, BLOCK_SIZE=1024, grid=grid)
else: # launch
pgm = add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
amdgcn = pgm.asm['amdgcn']
print(amdgcn)
5 changes: 4 additions & 1 deletion third_party/amd/python/test/test_address_sanitizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ def test_address_sanitizer():
# Disable buffer ops given it has builtin support for out of bound access.
os.environ["AMDGCN_USE_BUFFER_OPS"] = "0"

out = subprocess.Popen(["python", "address_sanitizer_helper.py"], stderr=subprocess.PIPE, stdout=subprocess.PIPE)
# Workaround: Code hangs if kernel compilation and kernel launch are done in the same step
subprocess.check_call(["python", "address_sanitizer_helper.py", "warmup"])
out = subprocess.Popen(["python", "address_sanitizer_helper.py", "launch"], stderr=subprocess.PIPE,
stdout=subprocess.PIPE)
assert "Begin function __asan_report" in out.stdout.read().decode()
assert "heap-buffer-overflow" in out.stderr.read().decode()
2 changes: 1 addition & 1 deletion third_party/proton/test/test_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def fn():
assert test_frame is not None
# {torch.ones, add, foo}
if is_hip():
assert len(test_frame["children"]) >= 2
assert len(test_frame["children"]) >= 1
else:
assert len(test_frame["children"]) >= 3
assert test_frame["children"][0]["metrics"]["time (ns)"] > 0
Expand Down
Loading