
Commit d6e3e58

Fix device compatibility assert
Signed-off-by: Witold Dziurdz <[email protected]>
1 parent 492a6e5 commit d6e3e58

5 files changed: +147 −219 lines


python/triton_kernels/tests/test_matmul.py

Lines changed: 1 addition & 1 deletion
@@ -473,7 +473,7 @@ def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, inner_expt_o
         w_tri = convert_layout(w_tri, w_layout, **w_layout_opts)
         w_scale_tri = convert_layout(w_scale_tri, w_scale_layout, **w_scale_layout_opts)
     else:
-        if torch.cuda.get_device_capability()[0] < 10:
+        if is_cuda() and torch.cuda.get_device_capability()[0] < 10:
             pytest.skip("transposed mxfp weight not supported with cuda capability < 10")
         if block_m == 16:
             pytest.skip("PassManager::run failed from Triton compiler")

python/triton_kernels/triton_kernels/matmul_ogs.py

Lines changed: 2 additions & 2 deletions
@@ -486,7 +486,7 @@ def matmul_ogs(x, w, bias,
     # TODO: remove this code path; using uint8 for mxfp4 weight will bite us when we want to support uint8 for real
     dtype = FP4 if w.dtype == torch.uint8 else w.dtype
     w = wrap_torch_tensor(w, dtype=dtype)
-    if w_has_mx and (torch.cuda.get_device_capability()[0] < 10 or w.storage.layout is not None and not isinstance(w.storage.layout, StridedLayout)):
+    if w_has_mx and is_cuda() and (torch.cuda.get_device_capability()[0] < 10 or w.storage.layout is not None and not isinstance(w.storage.layout, StridedLayout)):
         assert w.stride(-2) == 1, "`w` must be column-major when it has data-type mxfp and (swizzled or not on >=Blackwell)"
     if w_scale is not None and not isinstance(w_scale, Tensor):
         w_scale = Tensor(w_scale)
@@ -534,7 +534,7 @@ def matmul_ogs(x, w, bias,
     )
     has_gather_tma = has_gather and target_info.has_tma_gather()
     # hopper w/ mxfp4 doesn't support TMA
-    can_use_tma = can_use_tma and (torch.cuda.get_device_capability()[0] > 9 or bitwidth(w.dtype) != 4)
+    can_use_tma = can_use_tma and is_cuda() and (torch.cuda.get_device_capability()[0] > 9 or bitwidth(w.dtype) != 4)
     can_use_fused_scatter = has_scatter and (fused_activation.specs.fn is None) and (epilogue.specs.fn is None) and (routing_data.n_expts_act == 1)
     opt_flags = make_opt_flags(out_dtype, x.dtype, w.dtype, precision_config,
                                batch_size, M, N, w.shape[-2], routing_data,
