Cleaner code

sarckk · sarckk · commit a82257261bf9 · 2025-08-19T12:57:26.000-07:00
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py
@@ -64,7 +64,8 @@ class CommonAttentionMetadata:
     block_table_tensor: torch.Tensor
     slot_mapping: torch.Tensor
 
-    logits_indices: Optional[torch.Tensor] = None
+    logits_indices_padded: Optional[torch.Tensor] = None
+    num_logits_indices: Optional[int] = None
 
     causal: bool = True
 
@@ -534,7 +535,6 @@ def make_local_attention_virtual_batches(
         max_query_len=seqlens_q_local.max(),
         block_table_tensor=block_table_local,
         slot_mapping=common_attn_metadata.slot_mapping,
-        logits_indices=common_attn_metadata.logits_indices,
         causal=True,
     )
 
@@ -547,14 +547,14 @@ def make_kv_sharing_fast_prefill_common_attn_metadata(
         # Skip computing fast prefill path
         return common_attn_metadata
 
-    if common_attn_metadata.logits_indices is None:
-        # Logits_indices can be None if prompt_logprobs is
-        # set for at least one request in the current iteration
-        # fast prefill is not compatible with prompt_logprobs
-        # so skip computing fast prefill path
+    if (common_attn_metadata.logits_indices_padded is None
+            or common_attn_metadata.num_logits_indices is None):
         return common_attn_metadata
 
-    logits_indices = common_attn_metadata.logits_indices
+    logits_indices_padded = common_attn_metadata.logits_indices_padded
+    num_logits_indices = common_attn_metadata.num_logits_indices
+    # Get rid of CUDAGraph padding, if any
+    logits_indices = logits_indices_padded[:num_logits_indices]
     num_reqs = common_attn_metadata.num_reqs
     query_start_loc = common_attn_metadata.query_start_loc
     seq_lens = common_attn_metadata.seq_lens
@@ -597,7 +597,6 @@ def make_kv_sharing_fast_prefill_common_attn_metadata(
         max_query_len=decode_max_query_len,
         block_table_tensor=common_attn_metadata.block_table_tensor,
         slot_mapping=common_attn_metadata.slot_mapping,
-        logits_indices=logits_indices,
         causal=True,
     )
     return common_attn_metadata
@@ -608,6 +607,9 @@ def subclass_attention_metadata_builder(
     builder_cls: type[AttentionMetadataBuilder[M]],
     build_preprocess_fn: Callable[[CommonAttentionMetadata],
                                   CommonAttentionMetadata],
+    build_postprocess_fn: Optional[
+        Callable[[AttentionMetadataBuilder[M], CommonAttentionMetadata, Any],
+                 Any]] = None,
 ) -> type[AttentionMetadataBuilder[M]]:
     """
     Return a new subclass of `builder_cls` whose .build(...) method
@@ -619,9 +621,13 @@ def build(self,
               common_prefix_len: int,
               common_attn_metadata: CommonAttentionMetadata,
               fast_build: bool = False):
-        return builder_cls.build(self, common_prefix_len,
-                                 build_preprocess_fn(common_attn_metadata),
-                                 fast_build)
+        metadata = builder_cls.build(self, common_prefix_len,
+                                     build_preprocess_fn(common_attn_metadata),
+                                     fast_build)
+        if build_postprocess_fn is not None:
+            metadata = build_postprocess_fn(self, common_attn_metadata,
+                                            metadata)
+        return metadata
 
     Wrapped = type(
         name,
@@ -800,25 +806,25 @@ class KVSharingFastPrefillAttentionMetadata(Protocol):
 
 
 def create_kv_sharing_fast_prefill_attn_metadata_subclass(
-    attn_metadata_i: Any,
-    logits_indices_padded: torch.Tensor,
-    num_logits_indices: int,
-):
+    self: AttentionMetadataBuilder[M],
+    common_attn_metadata: CommonAttentionMetadata,
+    metadata: Any,
+) -> Any:
     # Dynamically create a a dataclass type that inherits
     # from attention metadata type but includes additional
     # fields logits_indices_padded and num_logits_indices
     # which are required for prefill truncation
     fast_prefill_metadata_type = (
         make_kv_sharing_fast_prefill_attention_metadata(
-            metadata_cls=type(attn_metadata_i), ))  # type: ignore
+            metadata_cls=type(metadata), ))  # type: ignore
     # Avoid deepcopy caused by dict.asdict
     attn_metadata_fields = {}
-    for field in fields(attn_metadata_i.__class__):
-        attn_metadata_fields[field.name] = getattr(attn_metadata_i, field.name)
+    for field in fields(metadata.__class__):
+        attn_metadata_fields[field.name] = getattr(metadata, field.name)
     attn_metadata_i = fast_prefill_metadata_type(
         **attn_metadata_fields,
-        logits_indices_padded=logits_indices_padded,
-        num_logits_indices=num_logits_indices,
+        logits_indices_padded=common_attn_metadata.logits_indices_padded,
+        num_logits_indices=common_attn_metadata.num_logits_indices,
     )
     return attn_metadata_i
 
@@ -829,14 +835,19 @@ def create_custom_attention_backend(
     underlying_attn_backend: AttentionBackend,
     build_preprocess_fn: Callable[[CommonAttentionMetadata],
                                   CommonAttentionMetadata],
+    build_postprocess_fn: Optional[
+        Callable[[AttentionMetadataBuilder[M], CommonAttentionMetadata, Any],
+                 Any]] = None,
 ) -> type[AttentionBackend]:
     # Dynamically create a new attention backend that wraps the
     # underlying attention backend but applies
     # `build_preproces_fn` before calling `build(...)`
     builder_cls = subclass_attention_metadata_builder(
         name_prefix=prefix,
         builder_cls=underlying_attn_backend.get_builder_cls(),
-        build_preprocess_fn=build_preprocess_fn)
+        build_preprocess_fn=build_preprocess_fn,
+        build_postprocess_fn=build_postprocess_fn,
+    )
     attn_backend = subclass_attention_backend(
         name_prefix=prefix,
         attention_backend_cls=underlying_attn_backend,
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
@@ -609,7 +609,6 @@ def prepare_inputs(
             max_query_len=new_query_len_per_req.max().item(),
             block_table_tensor=common_attn_metadata.block_table_tensor,
             slot_mapping=common_attn_metadata.slot_mapping[token_indices],
-            logits_indices=common_attn_metadata.logits_indices,
             causal=True,
         )
 
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
@@ -806,11 +806,11 @@ def _prepare_inputs(
 
         if (self.cache_config.kv_sharing_fast_prefill
                 and self.input_batch.num_prompt_logprobs):
-            logger.warning_once(
+            raise RuntimeError(
                 "Encountered at least one request with prompt_logprobs set "
                 "with --kv-sharing-fast-prefill enabled. Fast prefill doesn't "
-                "produce correct logits for prompt tokens, so fast prefill "
-                "will be disabled for scheduling rounds with prompt_logprobs.")
+                "produce correct logits for prompt tokens. Please try again "
+                "without the flag --kv-sharing-fast-prefill set.")
 
         # Prepare the attention metadata for each KV cache group and make layers
         # in the same group share the same metadata.
@@ -837,6 +837,8 @@ def _prepare_inputs(
                 max_query_len=max_num_scheduled_tokens,
                 block_table_tensor=blk_table_tensor,
                 slot_mapping=slot_mapping,
+                logits_indices_padded=logits_indices_padded,
+                num_logits_indices=logits_indices.size(0),
                 causal=True,
             )
 
@@ -857,34 +859,11 @@ def _prepare_inputs(
                         builder,
                     )
 
-                # If there is at least one request with prompt_logprobs set,
-                # we cannot enable this optimization as the logits of prompt
-                # tokens will no longer be valid when doing  fast prefill.
-                is_fast_prefill = (
-                    attn_group.layer_names[0]
-                    in self.kv_sharing_fast_prefill_eligible_layers
-                    and not self.input_batch.num_prompt_logprobs)
-                if is_fast_prefill:
-                    # If logits_indices is set, builder.build(...) will
-                    # preprocess the common metadata to skip prefill tokens
-                    common_attn_metadata.logits_indices = logits_indices
-                    # TODO(sarckk): Enable cascade attention for fast prefill
-                    common_prefix_len = 0
-
                 attn_metadata_i = (builder.build(
                     common_prefix_len=common_prefix_len,
                     common_attn_metadata=common_attn_metadata,
                 ))
 
-                if is_fast_prefill:
-                    # Eligible layers need extra metadata for use in the model.
-                    attn_metadata_i = \
-                        create_kv_sharing_fast_prefill_attn_metadata_subclass(
-                            attn_metadata_i,
-                            logits_indices_padded,
-                            logits_indices.size(0),
-                        )
-
                 for layer_name in attn_group.layer_names:
                     attn_metadata[layer_name] = attn_metadata_i
 
@@ -2577,6 +2556,7 @@ def get_attn_backends_for_layers(
                         "FastPrefill",
                         attn_backend,
                         make_kv_sharing_fast_prefill_common_attn_metadata,
+                        create_kv_sharing_fast_prefill_attn_metadata_subclass,
                     )
 
                 key = attn_backend.full_cls_name()

Original file line number	Diff line number	Diff line change
`@@ -609,7 +609,6 @@ def prepare_inputs(`
`609`	`609`	`max_query_len=new_query_len_per_req.max().item(),`
`610`	`610`	`block_table_tensor=common_attn_metadata.block_table_tensor,`
`611`	`611`	`slot_mapping=common_attn_metadata.slot_mapping[token_indices],`
`612`		`- logits_indices=common_attn_metadata.logits_indices,`
`613`	`612`	`causal=True,`
`614`	`613`	`)`
`615`	`614`