diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
index e658990e95e5..02e1d1f1fd02 100644
--- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
@@ -11,7 +11,7 @@
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
 from vllm.model_executor.layers.fused_moe.utils import (
     extract_required_args, moe_kernel_quantize_input)
-from vllm.utils.flashinfer import block_scale_interleave
+from vllm.utils.flashinfer import nvfp4_block_scale_interleave
 
 
 def get_local_sizes(local_tokens):
@@ -92,7 +92,7 @@ def prepare(
             dim=0,
             sizes=get_local_sizes(local_tokens))
         a1_m, a1_n = a1q.shape
-        a1q_scale = block_scale_interleave(a1q_scale)
+        a1q_scale = nvfp4_block_scale_interleave(a1q_scale)
 
         return a1q, a1q_scale, None, topk_ids, topk_weights
 
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index 460334d77f0a..81611ed07aaa 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -1254,8 +1254,8 @@ def apply(
             x, layer.w13_weight, layer.w2_weight), (
                 "Flashinfer CUTLASS Fused MoE not applicable!")
 
-        a1_gscale = torch.min(layer.w13_input_scale_quant)
-        a2_gscale = torch.min(layer.w2_input_scale_quant)
+        a1_gscale = layer.w13_input_scale_quant
+        a2_gscale = layer.w2_input_scale_quant
         extra_expert_args = {
             'g1_alphas': layer.g1_alphas,
             'g2_alphas': layer.g2_alphas,
diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py
index 1ddafbae7fc0..b25e3a49f181 100644
--- a/vllm/utils/flashinfer.py
+++ b/vllm/utils/flashinfer.py
@@ -69,8 +69,8 @@ def wrapper(*args, **kwargs):
 flashinfer_cutlass_fused_moe = _lazy_import_wrapper("flashinfer.fused_moe",
                                                     "cutlass_fused_moe")
 fp4_quantize = _lazy_import_wrapper("flashinfer", "fp4_quantize")
-block_scale_interleave = _lazy_import_wrapper("flashinfer",
-                                              "block_scale_interleave")
+nvfp4_block_scale_interleave = _lazy_import_wrapper(
+    "flashinfer", "nvfp4_block_scale_interleave")
 
 # Special case for autotune since it returns a context manager
 autotune = _lazy_import_wrapper(
@@ -95,7 +95,7 @@ def has_flashinfer_cutlass_fused_moe() -> bool:
     required_functions = [
         ("flashinfer.fused_moe", "cutlass_fused_moe"),
         ("flashinfer", "fp4_quantize"),
-        ("flashinfer", "block_scale_interleave"),
+        ("flashinfer", "nvfp4_block_scale_interleave"),
     ]
 
     for module_name, attr_name in required_functions:
@@ -110,7 +110,7 @@ def has_flashinfer_cutlass_fused_moe() -> bool:
     "flashinfer_trtllm_fp8_block_scale_moe",
     "flashinfer_cutlass_fused_moe",
     "fp4_quantize",
-    "block_scale_interleave",
+    "nvfp4_block_scale_interleave",
     "autotune",
     "has_flashinfer_moe",
     "has_flashinfer_cutlass_fused_moe",
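
For reviewers: the rename only changes the string literals handed to `_lazy_import_wrapper`, so nothing breaks at import time; a stale name would only surface as an `AttributeError` on the first call into FlashInfer. Below is a minimal sketch of that lazy-import pattern. It is an illustrative reconstruction, not the exact vLLM implementation, which may add fallbacks when FlashInfer is absent:

```python
# Hedged sketch of the lazy-import pattern in vllm/utils/flashinfer.py.
# The real _lazy_import_wrapper may differ in details; this only shows why
# the string literal must track the upstream FlashInfer rename.
import importlib
from typing import Any, Callable


def _lazy_import_wrapper(module_name: str,
                         attr_name: str) -> Callable[..., Any]:
    """Resolve module_name.attr_name on first call, so importing vLLM
    does not require flashinfer to be installed."""

    def wrapper(*args: Any, **kwargs: Any) -> Any:
        module = importlib.import_module(module_name)
        fn = getattr(module, attr_name)  # AttributeError if the name is stale
        return fn(*args, **kwargs)

    return wrapper


# After FlashInfer's rename, the old literal "block_scale_interleave" would
# no longer resolve, hence the update mirrored across this diff:
nvfp4_block_scale_interleave = _lazy_import_wrapper(
    "flashinfer", "nvfp4_block_scale_interleave")
```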
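
The `modelopt.py` hunk is a behavioral change rather than a rename: instead of collapsing the per-expert input scales to a single global scalar with `torch.min`, the full tensor is now forwarded to the fused MoE path. A self-contained illustration of the difference; the `[num_experts]` shape and the values are assumptions made for the example:

```python
import torch

# Hypothetical per-expert input scales, assumed shape [num_experts].
w13_input_scale_quant = torch.tensor([0.5, 0.25, 0.75])

# Before this diff: one global scale shared by all experts.
a1_gscale_old = torch.min(w13_input_scale_quant)  # tensor(0.2500), 0-dim

# After this diff: one scale per expert reaches the kernel unchanged.
a1_gscale_new = w13_input_scale_quant             # shape [3]
```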