sgl-project · bhargaveede · Nov 5, 2025 · Nov 7, 2025 · Nov 10, 2025 · Nov 11, 2025
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -41,8 +41,13 @@ FetchContent_Declare(
     GIT_TAG        5a0b7a8b7024175f223f4a47535650f317bcbbf3
     GIT_SHALLOW    OFF
 )
-FetchContent_MakeAvailable(repo-cutlass-sycl)
 
+# Fetch CUTLASS-SYCL (populated at most once per configure), then copy its
+# oneMKL helper module into this project's cmake/ directory.
+# NOTE(review): this writes into the source tree — confirm that is intended.
+FetchContent_MakeAvailable(repo-cutlass-sycl)
+file(COPY "${repo-cutlass-sycl_SOURCE_DIR}/cmake/onemkl.cmake"
+     DESTINATION "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 
 include_directories(
     ${CMAKE_CURRENT_SOURCE_DIR}/include

diff --git a/include/sgl_kernel_ops.h b/include/sgl_kernel_ops.h
@@ -167,7 +167,7 @@ torch::Tensor fp8_scaled_mm(
     const torch::Tensor& mat_b,
     const torch::Tensor& scales_a,
     const torch::Tensor& scales_b,
-    const torch::Dtype& out_dtype,
+    const at::ScalarType out_dtype,
     const c10::optional<torch::Tensor>& bias);
 torch::Tensor fp8_blockwise_scaled_mm(
     const torch::Tensor& mat_a,

diff --git a/python/sgl_kernel/gemm.py b/python/sgl_kernel/gemm.py
@@ -50,16 +50,16 @@ def _bmm_fp8_internal(
     A_scale: torch.Tensor,
     B_scale: torch.Tensor,
 ) -> None:
-    cublas_handle = torch.cuda.current_blas_handle()
+    # cublas_handle = torch.cuda.current_blas_handle()
     torch.ops.sgl_kernel.bmm_fp8.default(
         A,
         B,
         D,
         A_scale,
         B_scale,
         workspace_buffer,
-        cublas_handle,
-        get_cuda_stream(),
+        0,
+        0,
     )