diff --git a/.github/workflows/integration-tests-amd.yml b/.github/workflows/integration-tests-amd.yml index 2dfe994f9d28..495432a8902b 100644 --- a/.github/workflows/integration-tests-amd.yml +++ b/.github/workflows/integration-tests-amd.yml @@ -22,7 +22,7 @@ jobs: options: >- --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root --volume /home/runner/.triton:/github/home/.triton - - image: rocm/pytorch:rocm6.2.2_ubuntu22.04_py3.10_pytorch_2.5.1_asan + - image: pmylonamd/rocm7.0_ubuntu22.04_py3.10_pytorch_2.8.0_asan runner: ["amd-gfx942"] # We add --env-file to pull in HIP_VISIBLE_DEVICES and ROCR_VISIBLE_DEVICES definition for GPU isolation. options: >- @@ -44,6 +44,7 @@ jobs: PROTON_SKIP_PC_SAMPLING_TEST: 1 PYTHON: "python3" CCACHE_COMPRESS: "true" + PIP_BREAK_SYSTEM_PACKAGES: 1 container: image: ${{ matrix.image }} options: ${{ matrix.options }} @@ -157,18 +158,21 @@ jobs: run: | make test-distributed - name: Run asan tests on AMD - if: false + if: ${{ matrix.runner[0] == 'amd-gfx942' }} run: | cd third_party/amd/python/test/ ulimit -s 1024 export PATH=$(find ~/.triton/llvm -name llvm-symbolizer -printf '%h\n'):$PATH - export LD_LIBRARY_PATH=$(find /opt -name libclang_rt.asan-x86_64.so -printf '%h\n'):$LD_LIBRARY_PATH + export LD_LIBRARY_PATH=$(find /opt -name libclang_rt.asan-x86_64.so -printf '%h\n' | head -n1):$LD_LIBRARY_PATH export LD_LIBRARY_PATH=$(find /opt -type d -wholename *lib/llvm/lib/asan):$LD_LIBRARY_PATH - export LD_LIBRARY_PATH=$(find /usr -name libcaffe2_nvrtc.so -printf '%h\n'):$LD_LIBRARY_PATH - export CLANG_ASAN_LIB=$(find /opt -name libclang_rt.asan-x86_64.so) + TORCH_PATH=$(find /opt -name libcaffe2_nvrtc.so -printf '%h\n') + mv $TORCH_PATH/libamdhip64.so $TORCH_PATH/libamdhip64_bck.so + export LD_LIBRARY_PATH=$TORCH_PATH:$LD_LIBRARY_PATH + export LD_LIBRARY_PATH=$(find /opt -wholename *lib/asan/libamdhip64.so -printf '%h\n'):$LD_LIBRARY_PATH + export CLANG_ASAN_LIB=$(find /opt -name libclang_rt.asan-x86_64.so | head -n1) export HIP_ASAN_LIB=$(find /opt -wholename *lib/asan/libamdhip64.so) - ASAN_OPTIONS=detect_leaks=0,alloc_dealloc_mismatch=0 \ - LD_PRELOAD=$CLANG_ASAN_LIB:$HIP_ASAN_LIB python3 -m pytest -s test_address_sanitizer.py + ASAN_OPTIONS=detect_leaks=0,alloc_dealloc_mismatch=0 LD_PRELOAD=$CLANG_ASAN_LIB:$HIP_ASAN_LIB python3 -m pytest -s test_address_sanitizer.py + mv $TORCH_PATH/libamdhip64_bck.so $TORCH_PATH/libamdhip64.so - name: Run regression tests run: | make test-regression diff --git a/Makefile b/Makefile index 54ec7f3e8d99..f0bf2342ff9e 100644 --- a/Makefile +++ b/Makefile @@ -47,7 +47,6 @@ test-unit: all .PHONY: test-distributed test-distributed: all - $(PYTHON) -m pip install --upgrade pip $(PYTHON) -m pip install python/triton_kernels -v $(PYTEST) -s python/triton_kernels/bench/distributed.py diff --git a/python/test/unit/language/test_tensor_descriptor.py b/python/test/unit/language/test_tensor_descriptor.py index 59bd37950651..50561fc8321f 100644 --- a/python/test/unit/language/test_tensor_descriptor.py +++ b/python/test/unit/language/test_tensor_descriptor.py @@ -1190,8 +1190,21 @@ def trunc_to_tf32(tensor): tf32_simulated = masked_int.view(np.float32) return tf32_simulated + torch.manual_seed(42) + if dtype_str == "float32": + dtype = torch.float32 + elif dtype_str == "float16": + dtype = torch.float16 + else: # bfloat16 + dtype = torch.bfloat16 + # test a layout where block_m and block_N are split into two separate chunks. - A = numpy_random((M, K), dtype_str) + A = torch.rand((M, K), dtype=dtype) + if dtype == torch.bfloat16: + A = A.float().numpy() + else: + A = A.numpy() + if dtype_str == "float32": A = trunc_to_tf32(A) @@ -1204,7 +1217,12 @@ def chunk(X, BLOCK0, BLOCK1): A = to_triton(A, device=device, dst_type=dtype_str) A_reshaped = to_triton(A_reshaped, device=device, dst_type=dtype_str) - B = numpy_random((N, K), dtype_str) + B = torch.rand((N, K), dtype=dtype) + if dtype == torch.bfloat16: + B = B.float().numpy() + else: + B = B.numpy() + if dtype_str == "float32": B = trunc_to_tf32(B) diff --git a/third_party/amd/python/test/address_sanitizer_helper.py b/third_party/amd/python/test/address_sanitizer_helper.py index a40937677695..4434754c7a03 100644 --- a/third_party/amd/python/test/address_sanitizer_helper.py +++ b/third_party/amd/python/test/address_sanitizer_helper.py @@ -1,6 +1,7 @@ import torch import triton import triton.language as tl +import argparse size = 4096 x = torch.rand(size, device='cuda') @@ -28,6 +29,12 @@ def add_kernel( tl.store(output_ptr + offsets, output) -pgm = add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024) -amdgcn = pgm.asm['amdgcn'] -print(amdgcn) +parser = argparse.ArgumentParser() +parser.add_argument("mode", choices=["warmup", "launch"]) +args = parser.parse_args() +if args.mode == "warmup": + add_kernel.warmup(x, y, output, n_elements, BLOCK_SIZE=1024, grid=grid) +else: # launch + pgm = add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024) + amdgcn = pgm.asm['amdgcn'] + print(amdgcn) diff --git a/third_party/amd/python/test/test_address_sanitizer.py b/third_party/amd/python/test/test_address_sanitizer.py index be3794ceb0a2..6e1f432d8aa6 100644 --- a/third_party/amd/python/test/test_address_sanitizer.py +++ b/third_party/amd/python/test/test_address_sanitizer.py @@ -32,6 +32,9 @@ def test_address_sanitizer(): # Disable buffer ops given it has builtin support for out of bound access. os.environ["AMDGCN_USE_BUFFER_OPS"] = "0" - out = subprocess.Popen(["python", "address_sanitizer_helper.py"], stderr=subprocess.PIPE, stdout=subprocess.PIPE) + # Workaround: Code hangs if kernel compilation and kernel launch are done in the same step + subprocess.check_call(["python", "address_sanitizer_helper.py", "warmup"]) + out = subprocess.Popen(["python", "address_sanitizer_helper.py", "launch"], stderr=subprocess.PIPE, + stdout=subprocess.PIPE) assert "Begin function __asan_report" in out.stdout.read().decode() assert "heap-buffer-overflow" in out.stderr.read().decode() diff --git a/third_party/proton/test/test_profile.py b/third_party/proton/test/test_profile.py index c4b515cec388..d5df8d252b05 100644 --- a/third_party/proton/test/test_profile.py +++ b/third_party/proton/test/test_profile.py @@ -123,7 +123,7 @@ def fn(): assert test_frame is not None # {torch.ones, add, foo} if is_hip(): - assert len(test_frame["children"]) >= 2 + assert len(test_frame["children"]) >= 1 else: assert len(test_frame["children"]) >= 3 assert test_frame["children"][0]["metrics"]["time (ns)"] > 0