Commit 9a6f640

(wip) async scheduling + spec decode
1 parent e9b639d commit 9a6f640

4 files changed: +28 −11 lines

examples/offline_inference/spec_decode.py

Lines changed: 4 additions & 1 deletion
@@ -68,6 +68,8 @@ def parse_args():
     parser.add_argument("--model-dir", type=str, default=None)
     parser.add_argument("--eagle-dir", type=str, default=None)
     parser.add_argument("--custom-mm-prompts", action="store_true")
+    parser.add_argument("--no-spec-decode", action="store_true")
+    parser.add_argument("--async-scheduling", action="store_true")
     return parser.parse_args()
 
 
@@ -127,11 +129,12 @@ def main():
         enable_chunked_prefill=args.enable_chunked_prefill,
         enforce_eager=args.enforce_eager,
         gpu_memory_utilization=0.8,
-        speculative_config=speculative_config,
+        speculative_config=speculative_config if not args.no_spec_decode else None,
         disable_log_stats=False,
         max_model_len=16384,
         limit_mm_per_prompt={"image": 5},
         disable_chunked_mm_input=True,
+        async_scheduling=args.async_scheduling,
     )
 
     sampling_params = SamplingParams(temperature=args.temp, max_tokens=args.output_len)
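The two new flags let the example toggle speculative decoding off and async scheduling on from the command line. A minimal sketch of the resulting LLM construction, assuming a placeholder model name and an ngram-style speculative_config (both illustrative; the example script builds its own config from --eagle-dir and the other arguments):

# Minimal sketch of what examples/offline_inference/spec_decode.py ends up
# building after this change. The model name and speculative_config contents
# below are assumptions for illustration; only the speculative_config /
# async_scheduling wiring mirrors the diff above.
from vllm import LLM, SamplingParams

no_spec_decode = False      # corresponds to --no-spec-decode
async_scheduling = True     # corresponds to --async-scheduling

# Hypothetical ngram-based draft config; the example script builds its own.
speculative_config = {
    "method": "ngram",
    "num_speculative_tokens": 3,
    "prompt_lookup_max": 4,
}

llm = LLM(
    model="facebook/opt-125m",  # placeholder model for illustration
    speculative_config=speculative_config if not no_spec_decode else None,
    async_scheduling=async_scheduling,
)

outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.0, max_tokens=32))
print(outputs[0].outputs[0].text)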

vllm/engine/arg_utils.py

Lines changed: 0 additions & 7 deletions
@@ -1194,13 +1194,6 @@ def create_engine_config(
                 raise ValueError("Async scheduling is not supported with "
                                  "pipeline-parallel-size > 1.")
 
-            # Currently, async scheduling does not support speculative decoding.
-            # TODO(woosuk): Support it.
-            if self.speculative_config is not None:
-                raise ValueError(
-                    "Currently, speculative decoding is not supported with "
-                    "async scheduling.")
-
         parallel_config = ParallelConfig(
             pipeline_parallel_size=self.pipeline_parallel_size,
             tensor_parallel_size=self.tensor_parallel_size,
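The only functional change here is deleting the guard that made async scheduling and speculative decoding mutually exclusive at config-build time. A minimal sketch (hypothetical helper, not vLLM's actual code) of the shape of the validation that remains:

# A minimal sketch, not vLLM's actual code: after this commit, async scheduling
# still rejects pipeline parallelism > 1, but a non-None speculative_config no
# longer raises in create_engine_config().
from typing import Optional


def check_async_scheduling_compat(async_scheduling: bool,
                                  pipeline_parallel_size: int,
                                  speculative_config: Optional[dict]) -> None:
    if not async_scheduling:
        return
    if pipeline_parallel_size > 1:
        raise ValueError("Async scheduling is not supported with "
                         "pipeline-parallel-size > 1.")
    # Before this commit, a non-None speculative_config also raised here;
    # that guard is removed.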

vllm/v1/core/sched/async_scheduler.py

Lines changed: 2 additions & 1 deletion
@@ -23,7 +23,8 @@ def _update_after_schedule(
             if (request.num_computed_tokens == request.num_tokens_with_spec +
                     request.num_output_placeholders):
                 # The request will generate a new token in this scheduling step.
-                request.num_output_placeholders = 1 + len(request.spec_token_ids)
+                request.num_output_placeholders = 1 + len(
+                    request.spec_token_ids)
 
     def _update_request_with_output(
         self,
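With speculative decoding, a scheduling step that samples can commit up to one bonus token plus all accepted draft tokens, so the async scheduler reserves 1 + len(spec_token_ids) output placeholders rather than a single one. A toy illustration of that accounting (hypothetical helper, not vLLM code):

# Toy illustration of the placeholder accounting in
# AsyncScheduler._update_after_schedule: a sampling step may produce one
# "bonus" token plus up to len(spec_token_ids) accepted draft tokens, so that
# many output slots are reserved up front.
def num_placeholders_for_step(spec_token_ids: list[int]) -> int:
    return 1 + len(spec_token_ids)


# With 3 draft tokens proposed, 4 placeholders are reserved; if the model later
# rejects 2 of the drafts, only 2 placeholders are actually filled, and the
# model runner has to account for the difference (see gpu_model_runner.py
# below).
assert num_placeholders_for_step([11, 12, 13]) == 4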

vllm/v1/worker/gpu_model_runner.py

Lines changed: 22 additions & 2 deletions
@@ -332,6 +332,11 @@ def __init__(
 
         self.reorder_batch_threshold: Optional[int] = None
 
+        # Cache spec token ids and num rejected tokens from previous round,
+        # used when async scheduling and spec decoding are both enabled
+        self.cached_spec_token_ids = {}
+        self.cached_num_rejected_tokens = {}
+
     def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None:
         """
         Update the order of requests in the batch based on the attention
 
@@ -381,6 +386,8 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
         for req_id in scheduler_output.finished_req_ids:
             self.requests.pop(req_id, None)
             self.encoder_cache.pop(req_id, None)
+            self.cached_spec_token_ids.pop(req_id, None)
+            self.cached_num_rejected_tokens.pop(req_id, None)
         # Remove the finished requests from the persistent batch.
         # NOTE(woosuk): There could be an edge case where finished_req_ids and
         # scheduled_req_ids overlap. This happens when a request is aborted and
 
@@ -494,6 +501,8 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
         for i, req_id in enumerate(req_data.req_ids):
             req_state = self.requests[req_id]
             num_computed_tokens = req_data.num_computed_tokens[i]
+            if req_id in self.cached_num_rejected_tokens:
+                num_computed_tokens -= self.cached_num_rejected_tokens[req_id]
             new_block_ids = req_data.new_block_ids[i]
             resumed_from_preemption = req_data.resumed_from_preemption[i]
 
@@ -554,8 +563,12 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
             self.input_batch.num_tokens[req_index] = end_token_index
 
             # Add spec_token_ids to token_ids_cpu.
-            spec_token_ids = (
-                scheduler_output.scheduled_spec_decode_tokens.get(req_id, ()))
+            if req_id in self.cached_spec_token_ids:
+                spec_token_ids = self.cached_spec_token_ids[req_id]
+            else:
+                spec_token_ids = (
+                    scheduler_output.scheduled_spec_decode_tokens.get(
+                        req_id, ()))
             if spec_token_ids:
                 num_spec_tokens = len(spec_token_ids)
                 start_index = self.input_batch.num_tokens_no_spec[req_index]
 
@@ -1743,6 +1756,13 @@ def execute_model(
 
         self.eplb_step()
 
+        if self.speculative_config and self.scheduler_config.async_scheduling:
+            assert spec_token_ids
+            for idx, req_id in enumerate(self.input_batch.req_ids):
+                self.cached_spec_token_ids[req_id] = spec_token_ids[idx]
+                self.cached_num_rejected_tokens[req_id] = max_gen_len - len(
+                    valid_sampled_token_ids[idx])
+
         return ModelRunnerOutput(
             req_ids=self.input_batch.req_ids,
             req_id_to_index=self.input_batch.req_id_to_index,
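Because async scheduling plans the next step before the current step's sampling results are known, the scheduler's bookkeeping assumes every speculated token was accepted. The runner therefore caches, per request, the draft tokens it just proposed and the number of speculated tokens that were actually rejected, then applies both corrections when the next scheduler output arrives: num_computed_tokens is reduced by the cached rejection count, and the cached drafts take the place of the scheduler's stale spec_token_ids. A self-contained toy model of that bookkeeping (hypothetical class and names, not vLLM's GPUModelRunner):

# Toy model of the caching added in this commit for async scheduling + spec
# decode. The scheduler runs one step ahead and optimistically counts every
# draft token as computed; the runner remembers how many were rejected and
# which drafts it proposed, and corrects the next step's view accordingly.
class SpecDecodeCache:
    def __init__(self) -> None:
        self.cached_spec_token_ids: dict[str, list[int]] = {}
        self.cached_num_rejected_tokens: dict[str, int] = {}

    def record_step(self, req_id: str, spec_token_ids: list[int],
                    max_gen_len: int, num_valid_sampled: int) -> None:
        # Mirrors execute_model(): cache this step's drafts and how many of
        # the optimistically scheduled tokens were rejected.
        self.cached_spec_token_ids[req_id] = spec_token_ids
        self.cached_num_rejected_tokens[req_id] = max_gen_len - num_valid_sampled

    def corrected_num_computed(self, req_id: str, scheduled: int) -> int:
        # Mirrors _update_states(): the scheduler over-counted computed tokens
        # by the number of rejections from the previous round.
        return scheduled - self.cached_num_rejected_tokens.get(req_id, 0)

    def finish(self, req_id: str) -> None:
        # Mirrors the finished_req_ids cleanup.
        self.cached_spec_token_ids.pop(req_id, None)
        self.cached_num_rejected_tokens.pop(req_id, None)


cache = SpecDecodeCache()
# Step N: 3 drafts proposed, but only 2 of the 4 possible new tokens were
# valid, so 2 speculated tokens were rejected.
cache.record_step("req-0", spec_token_ids=[7, 8, 9], max_gen_len=4,
                  num_valid_sampled=2)
# Step N+1: the scheduler reports num_computed_tokens as if all 4 landed.
assert cache.corrected_num_computed("req-0", scheduled=100) == 98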
