from vllm.compilation.decorators import (ignore_torch_compile,
                                         support_torch_compile)
from vllm.config import (CacheConfig, CompilationConfig, CompilationLevel,
-                         VllmConfig, set_current_vllm_config)
-from vllm.forward_context import set_forward_context
+                         CUDAGraphMode, VllmConfig, set_current_vllm_config)
+from vllm.forward_context import BatchDescriptor, set_forward_context
from vllm.utils import direct_register_custom_op

# create a library to hold the custom op
@@ -40,6 +40,39 @@ def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
)


+@torch.inference_mode
+def run_model(vllm_config: VllmConfig, model: nn.Module,
+              cudagraph_runtime_mode: CUDAGraphMode):
+    with set_forward_context({}, vllm_config=vllm_config):
+        # warmup for the model with cudagraph_mode NONE
+        model(torch.randn(BATCH_SIZE, MLP_SIZE).cuda())
+
+    # simulate cudagraphs capturing
+    with set_forward_context({},
+                             vllm_config=vllm_config,
+                             cudagraph_runtime_mode=cudagraph_runtime_mode,
+                             batch_descriptor=BatchDescriptor(
+                                 num_tokens=2, )):
+        model(torch.randn(2, MLP_SIZE).cuda())
+    with set_forward_context({},
+                             vllm_config=vllm_config,
+                             cudagraph_runtime_mode=cudagraph_runtime_mode,
+                             batch_descriptor=BatchDescriptor(
+                                 num_tokens=1, )):
+        model(torch.randn(1, MLP_SIZE).cuda())
+
+    # simulate cudagraphs replay
+    with set_forward_context({},
+                             vllm_config=vllm_config,
+                             cudagraph_runtime_mode=cudagraph_runtime_mode,
+                             batch_descriptor=BatchDescriptor(
+                                 num_tokens=2, )):
+        output = model(torch.randn(2, MLP_SIZE).cuda())
+
+    output = output.cpu()
+    return output.cpu()
+
+
def test_ignore_torch_compile_decorator():
    # piecewise
    vllm_config = VllmConfig(compilation_config=CompilationConfig(
@@ -48,6 +81,7 @@ def test_ignore_torch_compile_decorator():
        splitting_ops=["silly.attention"],
        cudagraph_capture_sizes=[1, 2],
    ))
+    cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE

    @support_torch_compile
    class A(nn.Module):
@@ -86,12 +120,8 @@ class C(B):
            num_backend_compilations=2,
            num_cudagraph_captured=4,
            # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
-    ), set_forward_context({}, vllm_config=vllm_config):
-        # first run is for compile
-        mod_A(torch.randn(BATCH_SIZE, MLP_SIZE).cuda())
-        # run cudagraph captured sizes
-        mod_A(torch.randn(2, MLP_SIZE).cuda())
-        mod_A(torch.randn(1, MLP_SIZE).cuda())
+    ):
+        run_model(vllm_config, mod_A, cudagraph_runtime_mode)

    with set_current_vllm_config(vllm_config):
        mod_B = B(vllm_config=vllm_config, prefix='').eval().cuda()
@@ -103,10 +133,8 @@ class C(B):
            num_piecewise_capturable_graphs_seen=0,
            num_backend_compilations=0,
            num_cudagraph_captured=0,
-    ), set_forward_context({}, vllm_config=vllm_config):
-        mod_B(torch.randn(BATCH_SIZE, MLP_SIZE).cuda())
-        mod_B(torch.randn(2, MLP_SIZE).cuda())
-        mod_B(torch.randn(1, MLP_SIZE).cuda())
+    ):
+        run_model(vllm_config, mod_B, cudagraph_runtime_mode)

    with set_current_vllm_config(vllm_config):
        mod_C = C(vllm_config=vllm_config, prefix='').eval().cuda()
@@ -119,10 +147,8 @@ class C(B):
            num_backend_compilations=2,
            num_cudagraph_captured=4,
            # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
-    ), set_forward_context({}, vllm_config=vllm_config):
-        mod_C(torch.randn(BATCH_SIZE, MLP_SIZE).cuda())
-        mod_C(torch.randn(2, MLP_SIZE).cuda())
-        mod_C(torch.randn(1, MLP_SIZE).cuda())
+    ):
+        run_model(vllm_config, mod_C, cudagraph_runtime_mode)


# Only enable torch.compile if
@@ -180,6 +206,7 @@ def test_conditional_compile_enable_if():
        splitting_ops=["silly.attention"],
        cudagraph_capture_sizes=[1, 2],
    ))
+    cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE

    with set_current_vllm_config(vllm_config):
        mod_A = A(vllm_config=vllm_config, prefix='').eval().cuda()
@@ -195,12 +222,8 @@ def test_conditional_compile_enable_if():
            num_backend_compilations=4,
            num_cudagraph_captured=8,
            # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
-    ), set_forward_context({}, vllm_config=vllm_config):
-        # first run is for compile
-        mod_A(torch.randn(BATCH_SIZE, MLP_SIZE).cuda())
-        # run cudagraph captured sizes
-        mod_A(torch.randn(2, MLP_SIZE).cuda())
-        mod_A(torch.randn(1, MLP_SIZE).cuda())
+    ):
+        run_model(vllm_config, mod_A, cudagraph_runtime_mode)

    # Set kv_sharing_fast_prefill=False
    # which will cause A to be compiled and B to not be compiled
@@ -224,9 +247,5 @@ def test_conditional_compile_enable_if():
            num_backend_compilations=4,
            num_cudagraph_captured=8,
            # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
-    ), set_forward_context({}, vllm_config=vllm_config):
-        # first run is for compile
-        mod_A(torch.randn(BATCH_SIZE, MLP_SIZE).cuda())
-        # run cudagraph captured sizes
-        mod_A(torch.randn(2, MLP_SIZE).cuda())
-        mod_A(torch.randn(1, MLP_SIZE).cuda())
+    ):
+        run_model(vllm_config, mod_A, cudagraph_runtime_mode)
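
For reference, a minimal usage sketch of the new run_model() helper, not part of the diff. It assumes it lives in this same test file (so run_model, MLP_SIZE, and the existing imports are in scope); ToyModel and test_run_model_piecewise_sketch are illustrative names, and the CompilationConfig fields simply mirror the existing tests rather than prescribing anything.

@support_torch_compile
class ToyModel(nn.Module):
    # stands in for the A/B/C classes used by the real tests

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = '') -> None:
        super().__init__()
        self.fc = nn.Linear(MLP_SIZE, MLP_SIZE)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.fc(x)


def test_run_model_piecewise_sketch():
    vllm_config = VllmConfig(compilation_config=CompilationConfig(
        level=CompilationLevel.PIECEWISE,
        splitting_ops=["silly.attention"],
        cudagraph_capture_sizes=[1, 2],
    ))
    with set_current_vllm_config(vllm_config):
        model = ToyModel(vllm_config=vllm_config, prefix='').eval().cuda()

    # warm up, capture, and replay exactly as run_model() does above
    output = run_model(vllm_config, model, CUDAGraphMode.PIECEWISE)
    assert output.shape == (2, MLP_SIZE)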