@@ -332,10 +332,10 @@ def __init__(
 
         self.reorder_batch_threshold: Optional[int] = None
 
-        # Cache spec token ids and num rejected tokens from previous round,
+        # Cache spec token ids and num computed tokens from previous round,
         # used when async scheduling and spec decoding are both enabled
-        self.cached_spec_token_ids = {}
-        self.cached_num_rejected_tokens = {}
+        self.cached_spec_token_ids: dict[str, list[int]] = {}
+        self.cached_num_computed_tokens: dict[str, int] = {}
 
     def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None:
         """
@@ -387,7 +387,8 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
             self.requests.pop(req_id, None)
             self.encoder_cache.pop(req_id, None)
             self.cached_spec_token_ids.pop(req_id, None)
-            self.cached_num_rejected_tokens.pop(req_id, None)
+            self.cached_num_computed_tokens.pop(req_id, None)
+
 
         # Remove the finished requests from the persistent batch.
         # NOTE(woosuk): There could be an edge case where finished_req_ids and
         # scheduled_req_ids overlap. This happens when a request is aborted and
@@ -500,9 +501,13 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
         req_data = scheduler_output.scheduled_cached_reqs
         for i, req_id in enumerate(req_data.req_ids):
             req_state = self.requests[req_id]
-            num_computed_tokens = req_data.num_computed_tokens[i]
-            if req_id in self.cached_num_rejected_tokens:
-                num_computed_tokens -= self.cached_num_rejected_tokens[req_id]
+            if req_id in self.cached_spec_token_ids:
+                scheduler_output.scheduled_spec_decode_tokens[
+                    req_id] = self.cached_spec_token_ids[req_id]
+            if req_id in self.cached_num_computed_tokens:
+                num_computed_tokens = self.cached_num_computed_tokens[req_id]
+            else:
+                num_computed_tokens = req_data.num_computed_tokens[i]
             new_block_ids = req_data.new_block_ids[i]
             resumed_from_preemption = req_data.resumed_from_preemption[i]
 
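A standalone sketch of the lookup order this hunk implements: the worker-side cache, when an entry exists, takes precedence over the count the scheduler sent, presumably because async scheduling lets the scheduler run a round ahead of the worker's actual results (the function name and stub values are hypothetical):

```python
# Sketch: worker-cached token counts override the scheduler-provided ones.
def resolve_num_computed_tokens(
    req_id: str,
    scheduler_num_computed: int,
    cached_num_computed_tokens: dict[str, int],
) -> int:
    # Prefer the worker's own count when async scheduling + spec decoding
    # have cached one; otherwise fall back to the scheduler's value.
    return cached_num_computed_tokens.get(req_id, scheduler_num_computed)


cache = {"req-0": 40}
assert resolve_num_computed_tokens("req-0", 38, cache) == 40  # cached wins
assert resolve_num_computed_tokens("req-1", 38, cache) == 38  # fallback
```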
@@ -563,12 +568,8 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
             self.input_batch.num_tokens[req_index] = end_token_index
 
             # Add spec_token_ids to token_ids_cpu.
-            if req_id in self.cached_spec_token_ids:
-                spec_token_ids = self.cached_spec_token_ids[req_id]
-            else:
-                spec_token_ids = (
-                    scheduler_output.scheduled_spec_decode_tokens.get(
-                        req_id, ()))
+            spec_token_ids = (
+                scheduler_output.scheduled_spec_decode_tokens.get(req_id, ()))
             if spec_token_ids:
                 num_spec_tokens = len(spec_token_ids)
                 start_index = self.input_batch.num_tokens_no_spec[req_index]
@@ -1760,8 +1761,16 @@ def execute_model(
             assert spec_token_ids
             for idx, req_id in enumerate(self.input_batch.req_ids):
                 self.cached_spec_token_ids[req_id] = spec_token_ids[idx]
-                self.cached_num_rejected_tokens[req_id] = max_gen_len - len(
+                num_rejected_tokens = max_gen_len - len(
                     valid_sampled_token_ids[idx])
+                if req_id not in self.cached_num_computed_tokens:
+                    self.cached_num_computed_tokens[
+                        req_id] = scheduler_output.num_scheduled_tokens[
+                            req_id] - num_rejected_tokens
+                else:
+                    self.cached_num_computed_tokens[
+                        req_id] += scheduler_output.num_scheduled_tokens[
+                            req_id] - num_rejected_tokens
 
         return ModelRunnerOutput(
             req_ids=self.input_batch.req_ids,
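The accounting in this last hunk can be exercised in isolation. A hedged sketch, assuming `max_gen_len` is the per-request generation length including rejected draft tokens and `valid_sampled_token_ids[idx]` holds only the accepted ones, as the surrounding code suggests (the helper name is hypothetical):

```python
# Sketch: per-round update of the computed-token count, mirroring the hunk
# above. Rejected speculative tokens are not counted as computed.
def update_cached_num_computed_tokens(
    cached: dict[str, int],
    req_id: str,
    num_scheduled_tokens: int,
    max_gen_len: int,
    num_valid_sampled: int,
) -> None:
    num_rejected_tokens = max_gen_len - num_valid_sampled
    # The first round seeds the entry; later rounds accumulate onto it,
    # which is what the if/else in the diff does.
    cached[req_id] = cached.get(req_id, 0) + (
        num_scheduled_tokens - num_rejected_tokens)


cache: dict[str, int] = {}
update_cached_num_computed_tokens(cache, "req-0", 8, 4, 3)  # 1 token rejected
assert cache["req-0"] == 7
update_cached_num_computed_tokens(cache, "req-0", 4, 4, 4)  # all accepted
assert cache["req-0"] == 11
```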