heheda12345 · luccafong · Sep 30, 2025
diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py
@@ -39,6 +39,8 @@
 class SpeculativeConfig:
     """Configuration for speculative decoding."""
 
+    """Override the default enforce_eager from model_config"""
+    enforce_eager: bool = False
     # General speculative decoding control
     num_speculative_tokens: SkipValidation[int] = None  # type: ignore
     """The number of speculative tokens, if provided. It will default to the
@@ -208,6 +210,10 @@ def __post_init__(self):
                         "mimo","ernie4_5_moe", "qwen3_next")):
                 # use the draft model from the same model:
                 self.model = self.target_model_config.model
+                if self.target_model_config.hf_text_config.model_type == "deepseek_v32":
+                    # FIXME(luccafong): cudagraph with v32 MTP is not supported; remove this
+                    # override when the issue is fixed.
+                    self.enforce_eager = True
                 # Align the quantization of draft model for cases such as
                 # --quantization fp8 with a bf16 checkpoint.
                 if not self.quantization:

diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
@@ -59,6 +59,7 @@ def __init__(
     ):
         self.vllm_config = vllm_config
         self.speculative_config = vllm_config.speculative_config
+        assert self.speculative_config is not None
         self.draft_model_config = self.speculative_config.draft_model_config
         self.method = self.speculative_config.method
 
@@ -82,7 +83,9 @@ def __init__(
 
         self.use_cuda_graph = (self.vllm_config.compilation_config.level
                                == CompilationLevel.PIECEWISE and
-                               not self.vllm_config.model_config.enforce_eager)
+                               not self.vllm_config.model_config.enforce_eager
+                               and not self.speculative_config.enforce_eager
+        )
         self.cudagraph_batch_sizes = list(
             reversed(
                 self.vllm_config.compilation_config.cudagraph_capture_sizes))