from itertools import product

import torch
import triton
from sgl_kernel.flash_attn import flash_attn_with_kvcache


def flash_attn_baseline(
    q,
    k_cache,
    v_cache,
    causal,
    window_size,
    softmax_scale,
    softmax_sink,
    cache_seqlens,
    page_table,
    cu_seqlens_q,
    max_seqlen_q,
):
    """Baseline Flash Attention over the paged KV cache; returns output and softmax LSE."""
    out, lse, *rest = flash_attn_with_kvcache(
        q,
        k_cache,
        v_cache,
        causal=causal,
        softmax_sink=softmax_sink,
        window_size=window_size,
        softmax_scale=softmax_scale,
        page_table=page_table,
        cache_seqlens=cache_seqlens,
        cu_seqlens_q=cu_seqlens_q,
        max_seqlen_q=max_seqlen_q,
        return_softmax_lse=True,
    )
    return out, lse


# Benchmark configurations
causal = [True, False]
local = [True, False]
use_softmax_sink = [True, False]
batch_size = [1, 16]
q_seq_length_range = [1, 512, 1024]
kv_seq_length_range = [512, 1024, 2048, 4096, 8192, 16384]
page_size_range = [32, 64, 128]
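# Causal attention and local (sliding-window) attention are treated as mutually
# exclusive in this sweep, so configurations enabling both are filtered out.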
configs = list(
    filter(
        lambda cfg: not (cfg[0] and cfg[1]),
        product(
            causal,
            local,
            use_softmax_sink,
            batch_size,
            q_seq_length_range,
            kv_seq_length_range,
            page_size_range,
        ),
    )
)

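# triton.testing runs the benchmark once per configuration in `configs` and
# reports the timings for each provider.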
@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=[
            "causal",
            "local",
            "use_softmax_sink",
            "batch_size",
            "q_seq_length",
            "kv_seq_length",
            "page_size",
        ],
        x_vals=[list(c) for c in configs],
        line_arg="provider",
        line_vals=["flash_attn"],
        line_names=["Flash Attention"],
        styles=[("blue", "-")],
        ylabel="us",
        plot_name="flash-attention-performance",
        args={},
    )
)
def benchmark(
    causal,
    local,
    use_softmax_sink,
    batch_size,
    q_seq_length,
    kv_seq_length,
    page_size,
    provider,
):
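    # All tensors are created in bfloat16 on the XPU (Intel GPU) device.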
    dtype = torch.bfloat16
    device = torch.device("xpu")

    # Attention parameters
    num_heads = 16
    head_dim = 64

    # Create input tensors
    q = torch.randn(
        (batch_size * q_seq_length, num_heads, head_dim), device=device, dtype=dtype
    )
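    # Paged KV cache: keys and values are stored in pages of `page_size` tokens,
    # shaped (num_pages, page_size, num_heads, head_dim).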
    num_pages = (batch_size * kv_seq_length + page_size - 1) // page_size
    k_cache = torch.randn(
        (num_pages, page_size, num_heads, head_dim), device=device, dtype=dtype
    )
    v_cache = torch.randn(
        (num_pages, page_size, num_heads, head_dim), device=device, dtype=dtype
    )
    cache_seqlens = (
        torch.ones(batch_size, device=device, dtype=torch.int32) * kv_seq_length
    )
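    # The page table maps each sequence's logical pages to physical pages in the
    # cache; a random permutation simulates a fragmented allocation.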
    page_table = (
        torch.randperm(num_pages, device=device, dtype=torch.int32)
        .reshape(batch_size, -1)
        .contiguous()
    )
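    # Cumulative query lengths for the packed (varlen) query layout:
    # [0, q_len, 2 * q_len, ..., batch_size * q_len].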
    cu_seqlens_q = torch.arange(
        0,
        (batch_size + 1) * q_seq_length,
        step=q_seq_length,
        device=device,
        dtype=torch.int32,
    )
    max_seqlen_q = q_seq_length
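    # (-1, -1) disables the sliding window; otherwise a random (left, right)
    # window is used for local attention.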
    window_size = (-1, -1) if not local else torch.randint(0, kv_seq_length, (2,))

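    # Optional attention-sink values, one scalar per head.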
    softmax_sink = (
        torch.randn(num_heads, device=device, dtype=dtype) if use_softmax_sink else None
    )

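    # Standard attention scaling: 1 / sqrt(head_dim).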
    softmax_scale = 1.0 / (head_dim**0.5)

    quantiles = [0.5, 0.2, 0.8]

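    # Inputs are cloned inside the lambda so each timed call sees fresh tensors;
    # do_bench reports the median / 20th / 80th percentile latencies in ms.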
    if provider == "flash_attn":
        ms, min_ms, max_ms = triton.testing.do_bench(
            lambda: flash_attn_baseline(
                q.clone(),
                k_cache.clone(),
                v_cache.clone(),
                causal=causal,
                window_size=window_size,
                softmax_scale=softmax_scale,
                softmax_sink=softmax_sink,
                cache_seqlens=cache_seqlens,
                page_table=page_table,
                cu_seqlens_q=cu_seqlens_q,
                max_seqlen_q=max_seqlen_q,
            ),
            quantiles=quantiles,
        )

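    # Convert milliseconds to microseconds to match the report's ylabel ("us").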
    return 1000 * ms, 1000 * max_ms, 1000 * min_ms


if __name__ == "__main__":
    benchmark.run(print_data=True)
    print("Benchmark finished!")