Skip to content

Commit 0d5a442

Browse files
committed
Fall back to full prefill if prompt_logprobs set
Signed-off-by: Yong Hoon Shin <[email protected]>
1 parent a6514dd commit 0d5a442

File tree

2 files changed

+23
-9
lines changed

2 files changed

+23
-9
lines changed

vllm/v1/attention/backends/utils.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ class CommonAttentionMetadata:
6969

7070
logits_indices_padded: Optional[torch.Tensor] = None
7171
num_logits_indices: Optional[int] = None
72+
prompt_logprobs: Optional[bool] = None
7273

7374
causal: bool = True
7475

@@ -836,13 +837,25 @@ def build(self,
836837
common_prefix_len: int,
837838
common_attn_metadata: CommonAttentionMetadata,
838839
fast_build: bool = False) -> AttentionMetadata:
839-
new_common_attn_metadata =\
840-
make_kv_sharing_fast_prefill_common_attn_metadata(common_attn_metadata)
840+
# Either not set (None) or prompt_logprobs is False
841+
if not common_attn_metadata.prompt_logprobs:
842+
# Fast prefill path
843+
new_common_attn_metadata =\
844+
make_kv_sharing_fast_prefill_common_attn_metadata(common_attn_metadata)
845+
metadata = super(self.__class__,
846+
self).build(common_prefix_len,
847+
new_common_attn_metadata, fast_build)
848+
return create_kv_sharing_fast_prefill_attn_metadata_subclass(
849+
metadata, common_attn_metadata)
850+
851+
# Default path:
852+
# Either --kv-sharing-fast-prefill is not set or at least one request
853+
# in the current scheduling round requests logprobs for prompt tokens
854+
# which is not compatible with fast prefill
841855
metadata = super(self.__class__,
842-
self).build(common_prefix_len,
843-
new_common_attn_metadata, fast_build)
844-
return create_kv_sharing_fast_prefill_attn_metadata_subclass(
845-
metadata, common_attn_metadata)
856+
self).build(common_prefix_len, common_attn_metadata,
857+
fast_build)
858+
return metadata
846859

847860
# Dynamically create a new attention backend that wraps the
848861
# underlying attention backend but applies

vllm/v1/worker/gpu_model_runner.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -867,11 +867,11 @@ def _prepare_inputs(
867867

868868
if (self.cache_config.kv_sharing_fast_prefill
869869
and self.input_batch.num_prompt_logprobs):
870-
raise RuntimeError(
870+
logger.warning(
871871
"Encountered at least one request with prompt_logprobs set "
872872
"with --kv-sharing-fast-prefill enabled. Fast prefill doesn't "
873-
"produce correct logits for prompt tokens. Please try again "
874-
"without the flag --kv-sharing-fast-prefill set.")
873+
"produce correct logits for prompt tokens, so fast prefill will"
874+
" be disabled for this iteration.")
875875

876876
# Prepare the attention metadata for each KV cache group and make layers
877877
# in the same group share the same metadata.
@@ -900,6 +900,7 @@ def _prepare_inputs(
900900
slot_mapping=slot_mapping,
901901
logits_indices_padded=logits_indices_padded,
902902
num_logits_indices=logits_indices.size(0),
903+
prompt_logprobs=len(self.input_batch.num_prompt_logprobs) > 0,
903904
causal=True,
904905
)
905906

0 commit comments

Comments (0)