triton-lang · PMylon · Sep 18, 2025 · Sep 22, 2025 · Sep 22, 2025 · Sep 22, 2025
@@ -22,7 +22,7 @@ jobs:
             options: >-
               --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
               --volume /home/runner/.triton:/github/home/.triton
-          - image: rocm/pytorch:rocm6.2.2_ubuntu22.04_py3.10_pytorch_2.5.1_asan
+          - image: pmylonamd/rocm7.0_ubuntu22.04_py3.10_pytorch_2.8.0_asan
             runner: ["amd-gfx942"]
             # We add --env-file to pull in HIP_VISIBLE_DEVICES and ROCR_VISIBLE_DEVICES definition for GPU isolation.
             options: >-
@@ -44,6 +44,7 @@ jobs:
       PROTON_SKIP_PC_SAMPLING_TEST: 1
       PYTHON: "python3"
       CCACHE_COMPRESS: "true"
+      PIP_BREAK_SYSTEM_PACKAGES: 1
     container:
       image: ${{ matrix.image }}
       options: ${{ matrix.options }}
@@ -157,18 +158,24 @@ jobs:
         run: |
           make test-distributed
       - name: Run asan tests on AMD
-        if: false
+        if: ${{ matrix.runner[0] == 'amd-gfx942' }}
         run: |
           cd third_party/amd/python/test/
           ulimit -s 1024
           export PATH=$(find ~/.triton/llvm -name llvm-symbolizer  -printf '%h\n'):$PATH
-          export LD_LIBRARY_PATH=$(find /opt -name libclang_rt.asan-x86_64.so -printf '%h\n'):$LD_LIBRARY_PATH
+          export LD_LIBRARY_PATH=$(find /opt -name libclang_rt.asan-x86_64.so -printf '%h\n' | head -n1):$LD_LIBRARY_PATH
           export LD_LIBRARY_PATH=$(find /opt -type d -wholename *lib/llvm/lib/asan):$LD_LIBRARY_PATH
-          export LD_LIBRARY_PATH=$(find /usr -name libcaffe2_nvrtc.so -printf '%h\n'):$LD_LIBRARY_PATH
-          export CLANG_ASAN_LIB=$(find /opt -name libclang_rt.asan-x86_64.so)
+          # Hide the libamdhip64.so that sits next to libcaffe2_nvrtc.so for the duration of this step
+          # (presumably so the ASAN build found under lib/asan is loaded instead -- TODO confirm).
+          TORCH_PATH=$(find /opt -name libcaffe2_nvrtc.so -printf '%h\n' | head -n1)
+          mv "$TORCH_PATH/libamdhip64.so" "$TORCH_PATH/libamdhip64_bck.so"
+          # Restore on any exit: the step shell is fail-fast, so a failing pytest would otherwise skip the restore.
+          trap 'mv "$TORCH_PATH/libamdhip64_bck.so" "$TORCH_PATH/libamdhip64.so"' EXIT
+          export LD_LIBRARY_PATH=$TORCH_PATH:$LD_LIBRARY_PATH
+          export LD_LIBRARY_PATH=$(find /opt -wholename '*lib/asan/libamdhip64.so' -printf '%h\n'):$LD_LIBRARY_PATH
+          export CLANG_ASAN_LIB=$(find /opt -name libclang_rt.asan-x86_64.so | head -n1)
           export HIP_ASAN_LIB=$(find /opt -wholename *lib/asan/libamdhip64.so)
-          ASAN_OPTIONS=detect_leaks=0,alloc_dealloc_mismatch=0 \
-          LD_PRELOAD=$CLANG_ASAN_LIB:$HIP_ASAN_LIB python3 -m pytest -s test_address_sanitizer.py
+          ASAN_OPTIONS=detect_leaks=0,alloc_dealloc_mismatch=0 LD_PRELOAD=$CLANG_ASAN_LIB:$HIP_ASAN_LIB python3 -m pytest -s test_address_sanitizer.py
       - name: Run regression tests
         run: |
           make test-regression

@@ -47,7 +47,6 @@ test-unit: all
 
 .PHONY: test-distributed
 test-distributed: all
-	$(PYTHON) -m pip install --upgrade pip
 	$(PYTHON) -m pip install python/triton_kernels -v
 	$(PYTEST) -s python/triton_kernels/bench/distributed.py
 

@@ -1190,8 +1190,21 @@ def trunc_to_tf32(tensor):
         tf32_simulated = masked_int.view(np.float32)
         return tf32_simulated
 
+    torch.manual_seed(42)
+    if dtype_str == "float32":
+        dtype = torch.float32
+    elif dtype_str == "float16":
+        dtype = torch.float16
+    else:  # bfloat16
+        dtype = torch.bfloat16
+
     # test a layout where block_m and block_N are split into two separate chunks.
-    A = numpy_random((M, K), dtype_str)
+    A = torch.rand((M, K), dtype=dtype)
+    if dtype == torch.bfloat16:
+        A = A.float().numpy()
+    else:
+        A = A.numpy()
+
     if dtype_str == "float32":
         A = trunc_to_tf32(A)
 
@@ -1204,7 +1217,12 @@ def chunk(X, BLOCK0, BLOCK1):
     A = to_triton(A, device=device, dst_type=dtype_str)
     A_reshaped = to_triton(A_reshaped, device=device, dst_type=dtype_str)
 
-    B = numpy_random((N, K), dtype_str)
+    B = torch.rand((N, K), dtype=dtype)
+    if dtype == torch.bfloat16:
+        B = B.float().numpy()
+    else:
+        B = B.numpy()
+
     if dtype_str == "float32":
         B = trunc_to_tf32(B)
 

@@ -1,6 +1,7 @@
 import torch
 import triton
 import triton.language as tl
+import argparse
 
 size = 4096
 x = torch.rand(size, device='cuda')
@@ -28,6 +29,12 @@ def add_kernel(
     tl.store(output_ptr + offsets, output)
 
 
-pgm = add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
-amdgcn = pgm.asm['amdgcn']
-print(amdgcn)
+parser = argparse.ArgumentParser()
+parser.add_argument("mode", choices=["warmup", "launch"])
+args = parser.parse_args()
+if args.mode == "warmup":
+    add_kernel.warmup(x, y, output, n_elements, BLOCK_SIZE=1024, grid=grid)
+else:  # launch
+    pgm = add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
+    amdgcn = pgm.asm['amdgcn']
+    print(amdgcn)
@@ -32,6 +32,12 @@ def test_address_sanitizer():
     # Disable buffer ops given it has builtin support for out of bound access.
     os.environ["AMDGCN_USE_BUFFER_OPS"] = "0"
 
-    out = subprocess.Popen(["python", "address_sanitizer_helper.py"], stderr=subprocess.PIPE, stdout=subprocess.PIPE)
-    assert "Begin function __asan_report" in out.stdout.read().decode()
-    assert "heap-buffer-overflow" in out.stderr.read().decode()
+    # Workaround: Code hangs if kernel compilation and kernel launch are done in the same step
+    subprocess.check_call(["python", "address_sanitizer_helper.py", "warmup"])
+    out = subprocess.Popen(["python", "address_sanitizer_helper.py", "launch"], stderr=subprocess.PIPE,
+                           stdout=subprocess.PIPE)
+    # Drain both pipes together; reading stdout to EOF before touching stderr can block forever
+    # if the child fills the stderr pipe first.
+    stdout, stderr = out.communicate()
+    assert "Begin function __asan_report" in stdout.decode()
+    assert "heap-buffer-overflow" in stderr.decode()
@@ -123,7 +123,7 @@ def fn():
     assert test_frame is not None
     # {torch.ones, add, foo}
     if is_hip():
-        assert len(test_frame["children"]) >= 2
+        assert len(test_frame["children"]) >= 1
     else:
         assert len(test_frame["children"]) >= 3
     assert test_frame["children"][0]["metrics"]["time (ns)"] > 0