Commit e9b639d

refactor async_scheduler to keep num_output_placeholders constant
1 parent: c09efff


vllm/v1/core/sched/async_scheduler.py

Lines changed: 2 additions & 7 deletions
@@ -20,11 +20,10 @@ def _update_after_schedule(
         super()._update_after_schedule(scheduler_output)
         for req_id in scheduler_output.num_scheduled_tokens:
             request = self.requests[req_id]
-            if (request.num_computed_tokens == request.num_tokens +
+            if (request.num_computed_tokens == request.num_tokens_with_spec +
                     request.num_output_placeholders):
                 # The request will generate a new token in this scheduling step.
-                # TODO(woosuk): Support speculative decoding.
-                request.num_output_placeholders += 1
+                request.num_output_placeholders = 1 + len(request.spec_token_ids)
 
     def _update_request_with_output(
         self,
@@ -35,10 +34,6 @@ def _update_request_with_output(
         new_token_ids, stopped = super()._update_request_with_output(
             request, new_token_ids)
 
-        # Update the number of output placeholders.
-        request.num_output_placeholders -= len(new_token_ids)
-        assert request.num_output_placeholders >= 0
-
         # Cache the new tokens. Preempted requests should be skipped.
         if status_before_update == RequestStatus.RUNNING:
             self.kv_cache_manager.cache_blocks(
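For illustration, here is a minimal, self-contained sketch of the bookkeeping the new hunk performs in each scheduling step. FakeRequest and update_after_schedule are hypothetical, simplified stand-ins rather than vLLM classes: once every token known to the scheduler (including the currently proposed speculative tokens and any previously reserved placeholders) has been computed, the placeholder count is assigned to 1 + len(spec_token_ids) and stays constant, instead of being incremented here and decremented later in _update_request_with_output.

from dataclasses import dataclass, field
from typing import List


@dataclass
class FakeRequest:
    """Simplified stand-in for the scheduler's per-request state (illustration only)."""
    prompt_and_output_tokens: List[int] = field(default_factory=list)
    spec_token_ids: List[int] = field(default_factory=list)
    num_computed_tokens: int = 0
    num_output_placeholders: int = 0

    @property
    def num_tokens_with_spec(self) -> int:
        # All tokens known to the scheduler, including proposed speculative ones.
        return len(self.prompt_and_output_tokens) + len(self.spec_token_ids)


def update_after_schedule(request: FakeRequest) -> None:
    # Mirrors the new condition: the request emits output in this scheduling
    # step only once every known token (plus any previously reserved
    # placeholders) has been computed.
    if (request.num_computed_tokens ==
            request.num_tokens_with_spec + request.num_output_placeholders):
        # Reserve one guaranteed token plus up to len(spec_token_ids) accepted
        # speculative tokens. The value is assigned rather than incremented,
        # and is no longer decremented when outputs arrive.
        request.num_output_placeholders = 1 + len(request.spec_token_ids)


# A request with 8 known tokens and 2 speculative proposals, all computed.
req = FakeRequest(prompt_and_output_tokens=list(range(8)),
                  spec_token_ids=[101, 102],
                  num_computed_tokens=10)
update_after_schedule(req)
print(req.num_output_placeholders)  # 3 == 1 + len(spec_token_ids)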
