From 9b1def10f707e55ec837e43c93bdf18b7bc1f055 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Thu, 23 Oct 2025 13:42:10 +0000 Subject: [PATCH 01/16] Fixed sglang testing, added pin, updated fix --- .github/workflows/sglang-tests.yml | 100 -------- .github/workflows/third-party-tests.yml | 81 +++++- .../third_party/sglang/sglang-fix.patch | 242 ++++++++++++++++-- benchmarks/third_party/sglang/sglang-pin.txt | 1 + scripts/test-triton.sh | 44 +++- 5 files changed, 321 insertions(+), 147 deletions(-) delete mode 100644 .github/workflows/sglang-tests.yml create mode 100644 benchmarks/third_party/sglang/sglang-pin.txt diff --git a/.github/workflows/sglang-tests.yml b/.github/workflows/sglang-tests.yml deleted file mode 100644 index dc5cabc991..0000000000 --- a/.github/workflows/sglang-tests.yml +++ /dev/null @@ -1,100 +0,0 @@ -name: Third party SGLang tests - -on: - workflow_dispatch: - inputs: - runner_label: - description: Runner label, keep empty for default - type: string - default: "" - use_pyenv_python: - description: Use Python built with pyenv - type: boolean - default: false - schedule: - # About midnight PST Sunday (UTC-8) - - cron: "5 10 * * SUN" - - -# Cancels in-progress PR runs when the PR is updated. Manual runs are never cancelled. -concurrency: - group: ${{ github.workflow }}-${{ github.event_name == 'workflow_dispatch' && github.run_id || github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -permissions: read-all - -env: - PYTHON_VERSION: "3.10" - TAG: ${{ inputs.tag || (github.event_name == 'pull_request' && format('pr-{0}', github.event.number)) || (github.event_name == 'schedule' && 'ci') || 'test' }} - -jobs: - build: - name: SGLang tests - runs-on: - - linux - - ${{ inputs.runner_label || 'rolling' }} - timeout-minutes: 720 - defaults: - run: - shell: bash -noprofile --norc -eo pipefail -c "source /opt/intel/oneapi/setvars.sh > /dev/null; source {0}" - steps: - - name: Print inputs - run: | - cat <> $GITHUB_ENV - - - name: Install SGLang - id: install - run: | - git clone https://github.com/sgl-project/sglang.git - cd sglang - git apply ../benchmarks/third_party/sglang/sglang-fix.patch - pip install "./python[dev_xpu]" - - - name: Setup PyTorch - uses: ./.github/actions/setup-pytorch - - - name: Setup Triton - uses: ./.github/actions/setup-triton - - - name: Run SGLANG tests - if: ${{ steps.install.outcome == 'success' && !cancelled() }} - run: | - ./scripts/test-triton.sh --sglang --skip-pip-install --skip-pytorch-install - - - name: Upload test report - if: ${{ steps.install.outcome == 'success' && !cancelled() }} - uses: actions/upload-artifact@v4 - with: - name: test-reports - path: reports diff --git a/.github/workflows/third-party-tests.yml b/.github/workflows/third-party-tests.yml index 41a38b5c3f..8415d15b86 100644 --- a/.github/workflows/third-party-tests.yml +++ b/.github/workflows/third-party-tests.yml @@ -1,4 +1,4 @@ -name: Third party tests [liger-kernels, vllm] +name: Third party tests [liger-kernels, vllm, sglang] on: workflow_dispatch: @@ -28,12 +28,12 @@ env: TAG: ${{ inputs.tag || (github.event_name == 'pull_request' && format('pr-{0}', github.event.number)) || (github.event_name == 'schedule' && 'ci') || 'test' }} jobs: - build: - name: Third party tests [liger-kernels, vllm] + small-tests: + name: Third party tests [vllm, sglang] runs-on: - linux - ${{ inputs.runner_label || 'max1550' }} - timeout-minutes: 720 + timeout-minutes: 120 defaults: run: shell: bash -noprofile --norc -eo pipefail -c "source /opt/intel/oneapi/setvars.sh > /dev/null; source {0}" @@ -47,14 +47,7 @@ jobs: - name: Checkout repository uses: actions/checkout@v5 - - name: Install Python - if: ${{ !(inputs.use_pyenv_python || false) }} - uses: actions/setup-python@v6 - with: - python-version: ${{ env.PYTHON_VERSION }} - - name: Install Python (from pyenv) ${{ inputs.python_version }} - if: ${{ inputs.use_pyenv_python }} uses: ./.github/actions/setup-pyenv-python with: python-version: ${{ env.PYTHON_VERSION }} @@ -86,13 +79,75 @@ jobs: mkdir reports echo "REPORTS=$PWD/reports" >> $GITHUB_ENV + - name: Run SGLANG tests + if: ${{ steps.install.outcome == 'success' && !cancelled() }} + run: | + ./scripts/test-triton.sh --sglang --skip-pip-install --skip-pytorch-install + - name: Run VLLM tests if: ${{ steps.install.outcome == 'success' && !cancelled() }} run: | ./scripts/test-triton.sh --vllm --skip-pip-install --skip-pytorch-install - - name: Run Liger-Kernel tests + - name: Upload test report if: ${{ steps.install.outcome == 'success' && !cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: test-main-reports + path: reports + # We run all tests for Liger, so it's slow and we test it separately + liger: + name: Liger testing + runs-on: + - linux + - ${{ inputs.runner_label || 'max1550' }} + timeout-minutes: 120 + defaults: + run: + shell: bash -noprofile --norc -eo pipefail -c "source /opt/intel/oneapi/setvars.sh > /dev/null; source {0}" + steps: + - name: Print inputs + run: | + cat <> $GITHUB_ENV + + - name: Run Liger-Kernel tests run: | ./scripts/test-triton.sh --liger --skip-pip-install --skip-pytorch-install @@ -100,5 +155,5 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() }} uses: actions/upload-artifact@v4 with: - name: test-reports + name: test-liger-reports path: reports diff --git a/benchmarks/third_party/sglang/sglang-fix.patch b/benchmarks/third_party/sglang/sglang-fix.patch index 9b9d38dc43..b3769b6385 100644 --- a/benchmarks/third_party/sglang/sglang-fix.patch +++ b/benchmarks/third_party/sglang/sglang-fix.patch @@ -1,9 +1,9 @@ -diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py -index bc2affa1..8ef91e66 100644 ---- a/python/sglang/srt/utils.py -+++ b/python/sglang/srt/utils.py -@@ -228,6 +228,22 @@ def is_flashinfer_available(): - return importlib.util.find_spec("flashinfer") is not None and is_cuda() +diff --git a/python/sglang/srt/utils/common.py b/python/sglang/srt/utils/common.py +index 7c2f573e4..8023cd6be 100644 +--- a/python/sglang/srt/utils/common.py ++++ b/python/sglang/srt/utils/common.py +@@ -155,12 +155,44 @@ def is_cpu() -> bool: + return os.getenv("SGLANG_USE_CPU_ENGINE", "0") == "1" and is_host_cpu_x86() +def auto_detect_device(): @@ -22,26 +22,48 @@ index bc2affa1..8ef91e66 100644 + return "cpu" + + - _ENABLE_TORCH_INFERENCE_MODE = get_bool_env_var( - "SGLANG_ENABLE_TORCH_INFERENCE_MODE", "false" - ) + def get_cuda_version(): + if torch.version.cuda: + return tuple(map(int, torch.version.cuda.split("."))) + return (0, 0) + + ++def auto_detect_device(): ++ """ ++ Infer the device type based on the current environment. ++ """ ++ if is_cuda_alike(): ++ return "cuda" ++ elif is_xpu(): ++ return "xpu" ++ elif is_hpu(): ++ return "hpu" ++ elif is_npu(): ++ return "npu" ++ else: ++ return "cpu" ++ ++ + def _check(cc_major): + if not is_cuda(): + return False diff --git a/test/srt/test_triton_attention_kernels.py b/test/srt/test_triton_attention_kernels.py -index 47eb16a9..cce70fb9 100644 +index 16c107006..03b9411fa 100644 --- a/test/srt/test_triton_attention_kernels.py +++ b/test/srt/test_triton_attention_kernels.py -@@ -16,8 +16,11 @@ from sglang.srt.layers.attention.triton_ops.prefill_attention import ( +@@ -18,8 +18,11 @@ from sglang.srt.layers.attention.triton_ops.extend_attention import ( + from sglang.srt.layers.attention.triton_ops.prefill_attention import ( context_attention_fwd, ) - from sglang.test.test_utils import CustomTestCase +from sglang.srt.utils import auto_detect_device - + from sglang.test.test_utils import CustomTestCase +device = auto_detect_device() + - class TestTritonAttention(CustomTestCase): - def _set_all_seeds(self, seed): -@@ -37,24 +40,24 @@ class TestTritonAttention(CustomTestCase): + def extend_attention_fwd_torch( + q: torch.Tensor, # [extend_tokens, H_Q, D] +@@ -114,24 +117,24 @@ class TestTritonAttention(CustomTestCase): dtype = torch.bfloat16 b_seq_len_prefix = torch.randint( @@ -73,7 +95,7 @@ index 47eb16a9..cce70fb9 100644 ) for i in range(B): -@@ -65,15 +68,15 @@ class TestTritonAttention(CustomTestCase): +@@ -142,15 +145,15 @@ class TestTritonAttention(CustomTestCase): total_token_num = torch.sum(b_seq_len).item() extend_token_num = torch.sum(b_seq_len_extend).item() k_buffer = torch.empty( @@ -94,7 +116,7 @@ index 47eb16a9..cce70fb9 100644 for i in range(B): extend_start_in_buffer = b_start_loc[i] + b_seq_len_prefix[i] extend_end_in_buffer = b_start_loc[i] + b_seq_len[i] -@@ -86,20 +89,20 @@ class TestTritonAttention(CustomTestCase): +@@ -163,20 +166,20 @@ class TestTritonAttention(CustomTestCase): extend_start_in_buffer:extend_end_in_buffer ] q_extend[extend_start:extend_end] = torch.empty( @@ -120,7 +142,7 @@ index 47eb16a9..cce70fb9 100644 qo_indptr[1 : B + 1] = torch.cumsum(b_seq_len_extend[:B], dim=0) custom_mask = None -@@ -123,9 +126,9 @@ class TestTritonAttention(CustomTestCase): +@@ -200,9 +203,9 @@ class TestTritonAttention(CustomTestCase): b_seq_mask_len = b_seq_len_extend * b_seq_len custom_mask = torch.ones( @@ -132,7 +154,81 @@ index 47eb16a9..cce70fb9 100644 mask_indptr[1 : B + 1] = torch.cumsum(b_seq_mask_len[:B], dim=0) for i in range(B): causal_mask = ( -@@ -187,14 +190,14 @@ class TestTritonAttention(CustomTestCase): +@@ -263,22 +266,22 @@ class TestTritonAttention(CustomTestCase): + dtype = torch.bfloat16 + + b_seq_len_prefix = torch.randint( +- 1, N_CTX // 2, (B,), dtype=torch.int32, device="cuda" ++ 1, N_CTX // 2, (B,), dtype=torch.int32, device=device + ) + b_seq_len_extend = torch.randint( +- 1, N_CTX // 2, (B,), dtype=torch.int32, device="cuda" ++ 1, N_CTX // 2, (B,), dtype=torch.int32, device=device + ) + b_seq_len = b_seq_len_prefix + b_seq_len_extend + +- b_start_loc = torch.zeros((B,), dtype=torch.int32, device="cuda") ++ b_start_loc = torch.zeros((B,), dtype=torch.int32, device=device) + b_start_loc[1:] = torch.cumsum(b_seq_len[:-1], 0) +- b_start_loc_extend = torch.zeros((B,), dtype=torch.int32, device="cuda") ++ b_start_loc_extend = torch.zeros((B,), dtype=torch.int32, device=device) + b_start_loc_extend[1:] = torch.cumsum(b_seq_len_extend[:-1], 0) + +- kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda") ++ kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device=device) + kv_indptr[1 : B + 1] = torch.cumsum(b_seq_len_prefix[:B], dim=0) + kv_indices = torch.zeros( +- (b_seq_len_prefix.sum().item(),), dtype=torch.int32, device="cuda" ++ (b_seq_len_prefix.sum().item(),), dtype=torch.int32, device=device + ) + + for i in range(B): +@@ -289,15 +292,15 @@ class TestTritonAttention(CustomTestCase): + total_token_num = torch.sum(b_seq_len).item() + extend_token_num = torch.sum(b_seq_len_extend).item() + k_buffer = torch.empty( +- (total_token_num, H_KV, D), dtype=dtype, device="cuda" ++ (total_token_num, H_KV, D), dtype=dtype, device=device + ).normal_(mean=0.1, std=0.2) + v_buffer = torch.empty( +- (total_token_num, H_KV, D), dtype=dtype, device="cuda" ++ (total_token_num, H_KV, D), dtype=dtype, device=device + ).normal_(mean=0.1, std=0.2) + +- k_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device="cuda") +- v_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device="cuda") +- q_extend = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device="cuda") ++ k_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device=device) ++ v_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device=device) ++ q_extend = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device=device) + for i in range(B): + extend_start_in_buffer = b_start_loc[i] + b_seq_len_prefix[i] + extend_end_in_buffer = b_start_loc[i] + b_seq_len[i] +@@ -310,19 +313,19 @@ class TestTritonAttention(CustomTestCase): + extend_start_in_buffer:extend_end_in_buffer + ] + q_extend[extend_start:extend_end] = torch.empty( +- (b_seq_len_extend[i], H_Q, D), dtype=dtype, device="cuda" ++ (b_seq_len_extend[i], H_Q, D), dtype=dtype, device=device + ).normal_(mean=0.1, std=0.2) + + o_extend_triton = torch.empty( +- (extend_token_num, H_Q, D), dtype=dtype, device="cuda" ++ (extend_token_num, H_Q, D), dtype=dtype, device=device + ) + o_extend_torch = torch.empty( +- (extend_token_num, H_Q, D), dtype=dtype, device="cuda" ++ (extend_token_num, H_Q, D), dtype=dtype, device=device + ) + + b_seq_len_extend = b_seq_len - b_seq_len_prefix + max_len_extend = torch.max(b_seq_len_extend, 0)[0].item() +- qo_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda") ++ qo_indptr = torch.zeros((B + 1,), dtype=torch.int32, device=device) + qo_indptr[1 : B + 1] = torch.cumsum(b_seq_len_extend[:B], dim=0) + + extend_attention_fwd( +@@ -373,14 +376,14 @@ class TestTritonAttention(CustomTestCase): max_seq_len = max(seq_lens) # Create random input tensors @@ -153,7 +249,7 @@ index 47eb16a9..cce70fb9 100644 context_attention_fwd( q, k, v, o, b_start_loc, b_seq_len, max_seq_len, is_causal=is_causal -@@ -232,33 +235,33 @@ class TestTritonAttention(CustomTestCase): +@@ -418,33 +421,33 @@ class TestTritonAttention(CustomTestCase): total_tokens = B * seq_len sm_scale = 1.0 / (D**0.5) max_kv_splits = 8 @@ -197,7 +293,7 @@ index 47eb16a9..cce70fb9 100644 ) decode_attention_fwd( -@@ -296,34 +299,34 @@ class TestTritonAttention(CustomTestCase): +@@ -482,34 +485,34 @@ class TestTritonAttention(CustomTestCase): total_tokens = B * seq_len sm_scale = 1.0 / (D**0.5) max_kv_splits = 8 @@ -243,7 +339,7 @@ index 47eb16a9..cce70fb9 100644 ) decode_attention_fwd_normal( -@@ -343,12 +346,12 @@ class TestTritonAttention(CustomTestCase): +@@ -529,12 +532,12 @@ class TestTritonAttention(CustomTestCase): attn_logits1 = torch.empty( (B, H_Q, max_kv_splits, D_V), dtype=torch.float32, @@ -258,3 +354,103 @@ index 47eb16a9..cce70fb9 100644 ) decode_attention_fwd_grouped( +@@ -578,23 +581,23 @@ class TestTritonAttention(CustomTestCase): + dtype = torch.bfloat16 + + b_seq_len_prefix = torch.randint( +- 1, N_CTX // 2, (B,), dtype=torch.int32, device="cuda" ++ 1, N_CTX // 2, (B,), dtype=torch.int32, device=device + ) + b_seq_len_extend = torch.randint( +- 1, N_CTX // 2, (B,), dtype=torch.int32, device="cuda" ++ 1, N_CTX // 2, (B,), dtype=torch.int32, device=device + ) + b_seq_len = b_seq_len_prefix + b_seq_len_extend + +- b_start_loc = torch.zeros((B,), dtype=torch.int32, device="cuda") ++ b_start_loc = torch.zeros((B,), dtype=torch.int32, device=device) + b_start_loc[1:] = torch.cumsum(b_seq_len[:-1], 0) +- b_start_loc_extend = torch.zeros((B,), dtype=torch.int32, device="cuda") ++ b_start_loc_extend = torch.zeros((B,), dtype=torch.int32, device=device) + b_start_loc_extend[1:] = torch.cumsum(b_seq_len_extend[:-1], 0) + + # Setup prefix KV indices +- kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda") ++ kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device=device) + kv_indptr[1 : B + 1] = torch.cumsum(b_seq_len_prefix[:B], dim=0) + kv_indices = torch.zeros( +- (b_seq_len_prefix.sum().item(),), dtype=torch.int64, device="cuda" ++ (b_seq_len_prefix.sum().item(),), dtype=torch.int64, device=device + ) + + for i in range(B): +@@ -605,15 +608,15 @@ class TestTritonAttention(CustomTestCase): + total_token_num = torch.sum(b_seq_len).item() + extend_token_num = torch.sum(b_seq_len_extend).item() + k_buffer = torch.empty( +- (total_token_num, H_KV, D), dtype=dtype, device="cuda" ++ (total_token_num, H_KV, D), dtype=dtype, device=device + ).normal_(mean=0.1, std=0.2) + v_buffer = torch.empty( +- (total_token_num, H_KV, D), dtype=dtype, device="cuda" ++ (total_token_num, H_KV, D), dtype=dtype, device=device + ).normal_(mean=0.1, std=0.2) + +- k_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device="cuda") +- v_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device="cuda") +- q_extend = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device="cuda") ++ k_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device=device) ++ v_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device=device) ++ q_extend = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device=device) + + for i in range(B): + extend_start_in_buffer = b_start_loc[i] + b_seq_len_prefix[i] +@@ -627,16 +630,16 @@ class TestTritonAttention(CustomTestCase): + extend_start_in_buffer:extend_end_in_buffer + ] + q_extend[extend_start:extend_end] = torch.empty( +- (b_seq_len_extend[i], H_Q, D), dtype=dtype, device="cuda" ++ (b_seq_len_extend[i], H_Q, D), dtype=dtype, device=device + ).normal_(mean=0.1, std=0.2) + + # Setup for extend attention + max_len_extend = torch.max(b_seq_len_extend, 0)[0].item() +- qo_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda") ++ qo_indptr = torch.zeros((B + 1,), dtype=torch.int32, device=device) + qo_indptr[1 : B + 1] = torch.cumsum(b_seq_len_extend[:B], dim=0) + + # Run 2-stage kernel +- o_regular = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device="cuda") ++ o_regular = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device=device) + extend_attention_fwd( + q_extend, + k_extend, +@@ -658,9 +661,9 @@ class TestTritonAttention(CustomTestCase): + total_token_num - extend_token_num, + total_token_num, + dtype=torch.int64, +- device="cuda", ++ device=device, + ) +- extend_start_loc = torch.zeros((B,), dtype=torch.int32, device="cuda") ++ extend_start_loc = torch.zeros((B,), dtype=torch.int32, device=device) + extend_start_loc[1:] = torch.cumsum(b_seq_len_extend[:-1], 0) + + unified_kv_indptr, unified_kv_indices, prefix_lens = build_unified_kv_indices( +@@ -673,7 +676,7 @@ class TestTritonAttention(CustomTestCase): + ) + + # Run unified kernel +- o_unified = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device="cuda") ++ o_unified = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device=device) + extend_attention_fwd_unified( + q_extend, + o_unified, +@@ -716,7 +719,6 @@ class TestTritonAttention(CustomTestCase): + """Test build_unified_kv_indices correctness.""" + B = 4 + dtype = torch.int64 +- device = "cuda" + + # Setup test data + prefix_lens = torch.tensor([10, 20, 15, 25], dtype=torch.int32, device=device) diff --git a/benchmarks/third_party/sglang/sglang-pin.txt b/benchmarks/third_party/sglang/sglang-pin.txt new file mode 100644 index 0000000000..8f8517ba4b --- /dev/null +++ b/benchmarks/third_party/sglang/sglang-pin.txt @@ -0,0 +1 @@ +d6fee73d1f593bd6754cd2550775fd2e54aeae60 diff --git a/scripts/test-triton.sh b/scripts/test-triton.sh index 0bdc5de7ad..27ef883d99 100755 --- a/scripts/test-triton.sh +++ b/scripts/test-triton.sh @@ -30,6 +30,7 @@ TEST: --liger --vllm --install-vllm + --install-sglang OPTION: --unskip @@ -74,6 +75,7 @@ TEST_SGLANG=false TEST_LIGER=false TEST_VLLM=false INSTALL_VLLM=false +INSTALL_SGLANG=false TEST_TRITON_KERNELS=false VENV=false TRITON_TEST_REPORTS=false @@ -190,6 +192,11 @@ while (( $# != 0 )); do TEST_DEFAULT=false shift ;; + --install-sglang) + INSTALL_SGLANG=true + TEST_DEFAULT=false + shift + ;; --sglang) TEST_SGLANG=true TEST_DEFAULT=false @@ -589,26 +596,38 @@ run_inductor_tests() { grep AlbertForMaskedLM inductor_log.csv | grep -q ,pass, } -run_sglang_tests() { - echo "***************************************************" - echo "****** Running SGLang Triton tests ******" - echo "***************************************************" +run_sglang_install() { + echo "************************************************" + echo "****** Installing SGLang ****" + echo "************************************************" if ! [ -d "./sglang" ]; then git clone https://github.com/sgl-project/sglang.git fi - cd sglang if ! pip list | grep "sglang" ; then - git apply $TRITON_PROJ/benchmarks/third_party/sglang/sglang-fix.patch + cd sglang + git checkout "$(<../benchmarks/third_party/sglang/sglang-pin.txt)" + git apply ../benchmarks/third_party/sglang/sglang-fix.patch + + # That's how sglang assumes we'll pick out platform for now + cp python/pyproject_xpu.toml python/pyproject.toml + # We should remove all torch libraries from requirements to avoid reinstalling triton & torch + # We remove sgl kernel due to a bug in the current environment probably due to using newer torch + sed -i '/pytorch\|torch\|sgl-kernel/d' python/pyproject.toml pip install "./python[dev_xpu]" - - # SGLang installation breaks the default PyTorch and Triton versions, so we need to reinstall them. - $SCRIPTS_DIR/install-pytorch.sh --force-reinstall - $SCRIPTS_DIR/compile-triton.sh --triton + cd .. fi - pip install pytest pytest-xdist + pip install pytest pytest-cov pytest-xdist +} + +run_sglang_tests() { + echo "***************************************************" + echo "****** Running SGLang Triton tests ******" + echo "***************************************************" + + run_sglang_install run_pytest_command -vvv -n ${PYTEST_MAX_PROCESSES:-4} test/srt/test_triton_attention_kernels.py } @@ -745,6 +764,9 @@ test_triton() { if [ "$TEST_INDUCTOR" == true ]; then run_inductor_tests fi + if [ "$INSTALL_SGLANG" == true ]; then + run_sglang_install + fi if [ "$TEST_SGLANG" == true ]; then run_sglang_tests fi From 7875d12c46388b6abd70bd7dc45a5216ee73b6ca Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Thu, 23 Oct 2025 15:06:55 +0000 Subject: [PATCH 02/16] Debug --- scripts/test-triton.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/test-triton.sh b/scripts/test-triton.sh index 27ef883d99..ff1ac3f0e9 100755 --- a/scripts/test-triton.sh +++ b/scripts/test-triton.sh @@ -628,6 +628,7 @@ run_sglang_tests() { echo "***************************************************" run_sglang_install + cd sglang run_pytest_command -vvv -n ${PYTEST_MAX_PROCESSES:-4} test/srt/test_triton_attention_kernels.py } @@ -667,8 +668,9 @@ run_vllm_install() { cd vllm-xpu-kernels git checkout "$(<../benchmarks/third_party/vllm/vllm-kernels-pin.txt)" sed -i '/pytorch\|torch/d' requirements.txt + sed -i '/pytorch\|torch/d' pyproject.toml pip install -r requirements.txt - VLLM_TARGET_DEVICE=xpu pip install --no-build-isolation -e . + VLLM_TARGET_DEVICE=xpu pip install -vvv --no-build-isolation . cd .. VLLM_TARGET_DEVICE=xpu pip install --no-deps --no-build-isolation -e vllm @@ -691,7 +693,7 @@ run_vllm_tests() { run_triton_kernels_tests() { echo "***************************************************" - echo "****** Running Triton Kernels tests ******" + echo "****** Running Triton Kernels tests ******"requirements.txpt echo "***************************************************" cd $TRITON_PROJ/python/triton_kernels/tests From 40da9ae42ad09ee02a57ff349112019e0a615054 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Thu, 23 Oct 2025 15:52:57 +0000 Subject: [PATCH 03/16] Debug --- .github/workflows/third-party-tests.yml | 9 +-------- scripts/test-triton.sh | 4 +++- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/.github/workflows/third-party-tests.yml b/.github/workflows/third-party-tests.yml index 8415d15b86..9fda3ed4f1 100644 --- a/.github/workflows/third-party-tests.yml +++ b/.github/workflows/third-party-tests.yml @@ -64,15 +64,8 @@ jobs: - name: Setup PyTorch uses: ./.github/actions/setup-pytorch - - name: Build Triton wheels + - name: Setup Triton uses: ./.github/actions/setup-triton - with: - command: DEBUG=1 python -m build --wheel --no-isolation - - - name: Install Triton - id: install - run: | - pip install dist/*.whl - name: Create reports dir run: | diff --git a/scripts/test-triton.sh b/scripts/test-triton.sh index ff1ac3f0e9..c588910bc8 100755 --- a/scripts/test-triton.sh +++ b/scripts/test-triton.sh @@ -615,7 +615,9 @@ run_sglang_install() { # We should remove all torch libraries from requirements to avoid reinstalling triton & torch # We remove sgl kernel due to a bug in the current environment probably due to using newer torch sed -i '/pytorch\|torch\|sgl-kernel/d' python/pyproject.toml - pip install "./python[dev_xpu]" + echo "pyproject.toml after modification:" + cat python/pyproject.toml + pip install -e "./python" cd .. fi From a638d14ddf212148c708fd698642272a8628fb78 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Thu, 23 Oct 2025 16:08:39 +0000 Subject: [PATCH 04/16] Debug --- .github/workflows/third-party-tests.yml | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/.github/workflows/third-party-tests.yml b/.github/workflows/third-party-tests.yml index 9fda3ed4f1..2dac6d56ba 100644 --- a/.github/workflows/third-party-tests.yml +++ b/.github/workflows/third-party-tests.yml @@ -65,6 +65,7 @@ jobs: uses: ./.github/actions/setup-pytorch - name: Setup Triton + id: install uses: ./.github/actions/setup-triton - name: Create reports dir @@ -125,15 +126,9 @@ jobs: - name: Setup PyTorch uses: ./.github/actions/setup-pytorch - - name: Build Triton wheels - uses: ./.github/actions/setup-triton - with: - command: DEBUG=1 python -m build --wheel --no-isolation - - - name: Install Triton + - name: Setup Triton id: install - run: | - pip install dist/*.whl + uses: ./.github/actions/setup-triton - name: Create reports dir run: | From 31f641d3393319bb71d80081f047e01d5bd91ede Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Fri, 24 Oct 2025 07:58:11 +0000 Subject: [PATCH 05/16] Debug --- scripts/test-triton.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/test-triton.sh b/scripts/test-triton.sh index c588910bc8..437a3aaab5 100755 --- a/scripts/test-triton.sh +++ b/scripts/test-triton.sh @@ -617,7 +617,7 @@ run_sglang_install() { sed -i '/pytorch\|torch\|sgl-kernel/d' python/pyproject.toml echo "pyproject.toml after modification:" cat python/pyproject.toml - pip install -e "./python" + pip install -vvv -e "./python" cd .. fi @@ -695,7 +695,7 @@ run_vllm_tests() { run_triton_kernels_tests() { echo "***************************************************" - echo "****** Running Triton Kernels tests ******"requirements.txpt + echo "****** Running Triton Kernels tests *******" echo "***************************************************" cd $TRITON_PROJ/python/triton_kernels/tests From 4b961f05451c625397f09613708ddf53212477fb Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Fri, 24 Oct 2025 07:59:19 +0000 Subject: [PATCH 06/16] Debug --- scripts/test-triton.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/test-triton.sh b/scripts/test-triton.sh index 437a3aaab5..7f43a6aab3 100755 --- a/scripts/test-triton.sh +++ b/scripts/test-triton.sh @@ -618,6 +618,8 @@ run_sglang_install() { echo "pyproject.toml after modification:" cat python/pyproject.toml pip install -vvv -e "./python" + pip install pipdeptree + pipdeptree -r -p torch cd .. fi From 7e60298605276b266c5a642628b01b96dd0435bf Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Fri, 24 Oct 2025 08:24:09 +0000 Subject: [PATCH 07/16] Debug --- scripts/test-triton.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/test-triton.sh b/scripts/test-triton.sh index 7f43a6aab3..f200468a76 100755 --- a/scripts/test-triton.sh +++ b/scripts/test-triton.sh @@ -649,7 +649,7 @@ run_liger_tests() { pip install pytest pytest-xdist pytest-cov transformers pandas pytest datasets -e Liger-Kernel fi - run_pytest_command -vvv -n ${PYTEST_MAX_PROCESSES:-4} Liger-Kernel/test/ + run_pytest_command -vv -n ${PYTEST_MAX_PROCESSES:-4} Liger-Kernel/test/ } run_vllm_install() { @@ -674,7 +674,7 @@ run_vllm_install() { sed -i '/pytorch\|torch/d' requirements.txt sed -i '/pytorch\|torch/d' pyproject.toml pip install -r requirements.txt - VLLM_TARGET_DEVICE=xpu pip install -vvv --no-build-isolation . + VLLM_TARGET_DEVICE=xpu pip install --no-build-isolation . cd .. VLLM_TARGET_DEVICE=xpu pip install --no-deps --no-build-isolation -e vllm From 74fd84910a558f38492d5b31b5b416454546de05 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Fri, 24 Oct 2025 08:56:03 +0000 Subject: [PATCH 08/16] Debug --- scripts/test-triton.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/test-triton.sh b/scripts/test-triton.sh index f200468a76..9fb877d531 100755 --- a/scripts/test-triton.sh +++ b/scripts/test-triton.sh @@ -613,11 +613,12 @@ run_sglang_install() { # That's how sglang assumes we'll pick out platform for now cp python/pyproject_xpu.toml python/pyproject.toml # We should remove all torch libraries from requirements to avoid reinstalling triton & torch - # We remove sgl kernel due to a bug in the current environment probably due to using newer torch + # We remove sgl kernel due to a bug in the current environment probably due to using newer torch, we don't currently use it anyway + # We remove timm because it depends on torchvision, which depends on torch==2.9 sed -i '/pytorch\|torch\|sgl-kernel/d' python/pyproject.toml echo "pyproject.toml after modification:" cat python/pyproject.toml - pip install -vvv -e "./python" + pip install -e "./python" pip install pipdeptree pipdeptree -r -p torch cd .. From b2874409655a796c3c5118c35a6d84c76d33be23 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Fri, 24 Oct 2025 09:21:27 +0000 Subject: [PATCH 09/16] Debug --- scripts/test-triton.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/test-triton.sh b/scripts/test-triton.sh index 9fb877d531..fe278c4499 100755 --- a/scripts/test-triton.sh +++ b/scripts/test-triton.sh @@ -615,7 +615,7 @@ run_sglang_install() { # We should remove all torch libraries from requirements to avoid reinstalling triton & torch # We remove sgl kernel due to a bug in the current environment probably due to using newer torch, we don't currently use it anyway # We remove timm because it depends on torchvision, which depends on torch==2.9 - sed -i '/pytorch\|torch\|sgl-kernel/d' python/pyproject.toml + sed -i '/pytorch\|torch\|sgl-kernel\|timm/d' python/pyproject.toml echo "pyproject.toml after modification:" cat python/pyproject.toml pip install -e "./python" From 950d196cce2e8409ec2c580985023c8eaa5bd20d Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Fri, 24 Oct 2025 09:54:38 +0000 Subject: [PATCH 10/16] Cleanup --- scripts/test-triton.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/scripts/test-triton.sh b/scripts/test-triton.sh index fe278c4499..575ca2461d 100755 --- a/scripts/test-triton.sh +++ b/scripts/test-triton.sh @@ -616,11 +616,8 @@ run_sglang_install() { # We remove sgl kernel due to a bug in the current environment probably due to using newer torch, we don't currently use it anyway # We remove timm because it depends on torchvision, which depends on torch==2.9 sed -i '/pytorch\|torch\|sgl-kernel\|timm/d' python/pyproject.toml - echo "pyproject.toml after modification:" cat python/pyproject.toml pip install -e "./python" - pip install pipdeptree - pipdeptree -r -p torch cd .. fi From 787f6fc4c29b792ac3d2893a5ed2f12e1a1ffeb1 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Fri, 24 Oct 2025 10:21:12 +0000 Subject: [PATCH 11/16] Cleaned up --- scripts/test-triton.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/test-triton.sh b/scripts/test-triton.sh index 575ca2461d..bc347f4a00 100755 --- a/scripts/test-triton.sh +++ b/scripts/test-triton.sh @@ -647,7 +647,7 @@ run_liger_tests() { pip install pytest pytest-xdist pytest-cov transformers pandas pytest datasets -e Liger-Kernel fi - run_pytest_command -vv -n ${PYTEST_MAX_PROCESSES:-4} Liger-Kernel/test/ + run_pytest_command -vvv -n ${PYTEST_MAX_PROCESSES:-4} Liger-Kernel/test/ } run_vllm_install() { @@ -672,7 +672,7 @@ run_vllm_install() { sed -i '/pytorch\|torch/d' requirements.txt sed -i '/pytorch\|torch/d' pyproject.toml pip install -r requirements.txt - VLLM_TARGET_DEVICE=xpu pip install --no-build-isolation . + VLLM_TARGET_DEVICE=xpu pip install --no-build-isolation -e . cd .. VLLM_TARGET_DEVICE=xpu pip install --no-deps --no-build-isolation -e vllm From bf8bf04affbad9cc29497a54454af2a641eecf48 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Fri, 24 Oct 2025 12:01:08 +0000 Subject: [PATCH 12/16] Added patch to liger kernels, refactor liger install --- .github/workflows/third-party-benchmarks.yml | 9 ++--- .../{liger_kernels => liger}/README.md | 0 .../run_benchmarks.sh | 0 .../{liger_kernels => liger}/transform.py | 0 scripts/test-triton.sh | 40 ++++++++++++++----- 5 files changed, 34 insertions(+), 15 deletions(-) rename benchmarks/third_party/{liger_kernels => liger}/README.md (100%) rename benchmarks/third_party/{liger_kernels => liger}/run_benchmarks.sh (100%) rename benchmarks/third_party/{liger_kernels => liger}/transform.py (100%) diff --git a/.github/workflows/third-party-benchmarks.yml b/.github/workflows/third-party-benchmarks.yml index 6612f937f5..866ec24cc7 100644 --- a/.github/workflows/third-party-benchmarks.yml +++ b/.github/workflows/third-party-benchmarks.yml @@ -108,17 +108,14 @@ jobs: run: | source ./scripts/capture-hw-details.sh - cd benchmarks/third_party/liger_kernels - - git clone https://github.com/linkedin/Liger-Kernel - pip install -e Liger-Kernel + ./scripts/test-triton.sh --install-vllm --skip-pip-install --skip-pytorch-install # To remember return code, but still copy results RET_CODE=0 - bash ./run_benchmarks.sh || RET_CODE=$? + bash benchmarks/third_party/liger/run_benchmarks.sh || RET_CODE=$? cp Liger-Kernel/benchmark/data/all_benchmark_data.csv $REPORTS/liger-raw.csv - python transform.py $REPORTS/liger-raw.csv $REPORTS/liger-report.csv --tag $TAG + python benchmarks/third_party/liger/transform.py $REPORTS/liger-raw.csv $REPORTS/liger-report.csv --tag $TAG # Return the captured return code at the end exit "$RET_CODE" diff --git a/benchmarks/third_party/liger_kernels/README.md b/benchmarks/third_party/liger/README.md similarity index 100% rename from benchmarks/third_party/liger_kernels/README.md rename to benchmarks/third_party/liger/README.md diff --git a/benchmarks/third_party/liger_kernels/run_benchmarks.sh b/benchmarks/third_party/liger/run_benchmarks.sh similarity index 100% rename from benchmarks/third_party/liger_kernels/run_benchmarks.sh rename to benchmarks/third_party/liger/run_benchmarks.sh diff --git a/benchmarks/third_party/liger_kernels/transform.py b/benchmarks/third_party/liger/transform.py similarity index 100% rename from benchmarks/third_party/liger_kernels/transform.py rename to benchmarks/third_party/liger/transform.py diff --git a/scripts/test-triton.sh b/scripts/test-triton.sh index bc347f4a00..148354e981 100755 --- a/scripts/test-triton.sh +++ b/scripts/test-triton.sh @@ -596,6 +596,10 @@ run_inductor_tests() { grep AlbertForMaskedLM inductor_log.csv | grep -q ,pass, } +run_test_deps_install() { + pip install pytest pytest-cov pytest-xdist +} + run_sglang_install() { echo "************************************************" echo "****** Installing SGLang ****" @@ -620,8 +624,6 @@ run_sglang_install() { pip install -e "./python" cd .. fi - - pip install pytest pytest-cov pytest-xdist } run_sglang_tests() { @@ -630,23 +632,39 @@ run_sglang_tests() { echo "***************************************************" run_sglang_install + run_test_deps_install cd sglang run_pytest_command -vvv -n ${PYTEST_MAX_PROCESSES:-4} test/srt/test_triton_attention_kernels.py } -run_liger_tests() { +run_liger_install() { echo "************************************************" - echo "****** Running Liger Triton tests ******" + echo "****** Installing Liger-Kernel ******" echo "************************************************" if ! [ -d "./Liger-Kernel" ]; then git clone https://github.com/linkedin/Liger-Kernel + + # There is probably an issue with cache + # Will try to upstream the patch here: + # https://github.com/linkedin/Liger-Kernel/pull/917 + # After merging we can remove this patch application + git apply benchmarks/third_party/liger/liger-fix.patch fi if ! pip list | grep "liger_kernel" ; then - pip install pytest pytest-xdist pytest-cov transformers pandas pytest datasets -e Liger-Kernel + pip install transformers pandas datasets -e Liger-Kernel fi +} + + +run_liger_tests() { + echo "************************************************" + echo "****** Running Liger-Kernel tests ******" + echo "************************************************" + run_liger_install + run_test_deps_install run_pytest_command -vvv -n ${PYTEST_MAX_PROCESSES:-4} Liger-Kernel/test/ } @@ -678,7 +696,7 @@ run_vllm_install() { VLLM_TARGET_DEVICE=xpu pip install --no-deps --no-build-isolation -e vllm fi - pip install pytest pytest-cov pytest-xdist cachetools cbor2 blake3 pybase64 openai_harmony tblib + pip install cachetools cbor2 blake3 pybase64 openai_harmony tblib } @@ -688,6 +706,7 @@ run_vllm_tests() { echo "************************************************" run_vllm_install + run_test_deps_install cd vllm run_pytest_command -vvv tests/kernels/moe/test_batched_moe.py tests/kernels/attention/test_triton_unified_attention.py @@ -774,15 +793,18 @@ test_triton() { if [ "$TEST_SGLANG" == true ]; then run_sglang_tests fi + if [ "$INSTALL_LIGER" == true ]; then + run_liger_install + fi if [ "$TEST_LIGER" == true ]; then run_liger_tests fi - if [ "$TEST_VLLM" == true ]; then - run_vllm_tests - fi if [ "$INSTALL_VLLM" == true ]; then run_vllm_install fi + if [ "$TEST_VLLM" == true ]; then + run_vllm_tests + fi if [ "$TEST_TRITON_KERNELS" == true ]; then run_triton_kernels_tests fi From cc48db6a8ca6ad06d68e5c94590c02c48b973a16 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Fri, 24 Oct 2025 13:44:11 +0000 Subject: [PATCH 13/16] Fixes --- scripts/test-triton.sh | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/scripts/test-triton.sh b/scripts/test-triton.sh index 148354e981..5899628fdf 100755 --- a/scripts/test-triton.sh +++ b/scripts/test-triton.sh @@ -26,11 +26,12 @@ TEST: --flex-attention --instrumentation --inductor - --sglang - --liger --vllm --install-vllm + --sglang --install-sglang + --liger + --install-liger OPTION: --unskip @@ -72,10 +73,11 @@ TEST_BENCHMARK_FLEX_ATTENTION=false TEST_INSTRUMENTATION=false TEST_INDUCTOR=false TEST_SGLANG=false +INSTALL_SGLANG=false TEST_LIGER=false +INSTALL_LIGER=false TEST_VLLM=false INSTALL_VLLM=false -INSTALL_SGLANG=false TEST_TRITON_KERNELS=false VENV=false TRITON_TEST_REPORTS=false @@ -192,13 +194,13 @@ while (( $# != 0 )); do TEST_DEFAULT=false shift ;; - --install-sglang) - INSTALL_SGLANG=true + --sglang) + TEST_SGLANG=true TEST_DEFAULT=false shift ;; - --sglang) - TEST_SGLANG=true + --install-sglang) + INSTALL_SGLANG=true TEST_DEFAULT=false shift ;; @@ -207,6 +209,11 @@ while (( $# != 0 )); do TEST_DEFAULT=false shift ;; + --install-liger) + INSTALL_LIGER=true + TEST_DEFAULT=false + shift + ;; --vllm) TEST_VLLM=true TEST_DEFAULT=false From aedede21e23b70dda0e000f2bc51bc8075b01882 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Fri, 24 Oct 2025 14:27:12 +0000 Subject: [PATCH 14/16] Removed liger patch --- scripts/test-triton.sh | 6 ------ 1 file changed, 6 deletions(-) diff --git a/scripts/test-triton.sh b/scripts/test-triton.sh index 5899628fdf..8fc375c548 100755 --- a/scripts/test-triton.sh +++ b/scripts/test-triton.sh @@ -651,12 +651,6 @@ run_liger_install() { if ! [ -d "./Liger-Kernel" ]; then git clone https://github.com/linkedin/Liger-Kernel - - # There is probably an issue with cache - # Will try to upstream the patch here: - # https://github.com/linkedin/Liger-Kernel/pull/917 - # After merging we can remove this patch application - git apply benchmarks/third_party/liger/liger-fix.patch fi if ! pip list | grep "liger_kernel" ; then From b3680ad06996801489e8febbcfa632cc40444c5f Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Fri, 24 Oct 2025 15:52:53 +0000 Subject: [PATCH 15/16] Debug --- scripts/test-triton.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/test-triton.sh b/scripts/test-triton.sh index 8fc375c548..84811ae6a1 100755 --- a/scripts/test-triton.sh +++ b/scripts/test-triton.sh @@ -666,7 +666,7 @@ run_liger_tests() { run_liger_install run_test_deps_install - run_pytest_command -vvv -n ${PYTEST_MAX_PROCESSES:-4} Liger-Kernel/test/ + run_pytest_command -vvv Liger-Kernel/test/ } run_vllm_install() { From 051a83d2c543f6e83c476fc1c7c8c46d90069da8 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Mon, 27 Oct 2025 11:42:01 +0000 Subject: [PATCH 16/16] Fixed installation --- .github/workflows/third-party-benchmarks.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/third-party-benchmarks.yml b/.github/workflows/third-party-benchmarks.yml index 866ec24cc7..59a1a9158f 100644 --- a/.github/workflows/third-party-benchmarks.yml +++ b/.github/workflows/third-party-benchmarks.yml @@ -104,11 +104,11 @@ jobs: python transform_results.py $REPORTS/moe-gemm-performance.csv $REPORTS/moe-gemm-fp8-report.csv --tag $TAG --benchmark moe-fp8-benchmark - name: Run Liger-Kernel benchmarks - if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'liger-kernel')) }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'liger')) }} run: | source ./scripts/capture-hw-details.sh - ./scripts/test-triton.sh --install-vllm --skip-pip-install --skip-pytorch-install + ./scripts/test-triton.sh --install-liger --skip-pip-install --skip-pytorch-install # To remember return code, but still copy results RET_CODE=0