Commit 271f14c (parent: dd98cce)

Fix rebase

Signed-off-by: Yong Hoon Shin <[email protected]>

File tree: 5 files changed, +39 −33 lines

vllm/attention/layers/chunked_local_attention.py

Lines changed: 2 additions & 2 deletions
@@ -40,15 +40,15 @@ def __init__(self,
                                                        kv_cache_dtype,
                                                        block_size)
 
-            prefix = \
+            backend_prefix = \
                 f"ChunkedLocalAttention_{attention_chunk_size}_{block_size}_"
 
             def build_preprocess_fn(cm: CommonAttentionMetadata):
                 return make_local_attention_virtual_batches(
                     attention_chunk_size, cm, block_size)
 
             attn_backend = create_custom_attention_backend(
-                prefix, underlying_attn_backend, build_preprocess_fn)
+                backend_prefix, underlying_attn_backend, build_preprocess_fn)
         else:
             # in v0 the local attention is handled inside the backends
             attn_backend = None
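
The rename appears intended to keep the backend name string separate from the layer's own `prefix` constructor argument, so the module-path prefix is not overwritten before it is used elsewhere (my reading of the change; the commit message does not say). A minimal sketch of that pattern, with made-up class and function names rather than the real vLLM ones:

# Minimal sketch (hypothetical names, not the real vLLM classes) of why the
# local variable is renamed: the layer already receives a module-path
# `prefix` argument, and reusing that name for the backend key would
# silently overwrite it.

def make_backend_prefix(attention_chunk_size: int, block_size: int) -> str:
    # Encode the parameters that make the wrapped backend unique, so layers
    # with different chunk/block sizes never share a cached custom backend.
    return f"ChunkedLocalAttention_{attention_chunk_size}_{block_size}_"

class ChunkedLocalAttentionSketch:
    def __init__(self, prefix: str, attention_chunk_size: int, block_size: int):
        backend_prefix = make_backend_prefix(attention_chunk_size, block_size)
        self.backend_name = backend_prefix + "Backend"
        self.prefix = prefix  # module path stays intact

layer = ChunkedLocalAttentionSketch("model.layers.0.self_attn",
                                    attention_chunk_size=1024, block_size=16)
print(layer.prefix, layer.backend_name)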

vllm/config/__init__.py

Lines changed: 6 additions & 0 deletions
@@ -3683,6 +3683,12 @@ def __post_init__(self):
             # local attention.
             self.scheduler_config.disable_hybrid_kv_cache_manager = True
 
+        if self.cache_config.kv_sharing_fast_prefill:
+            # There is an IMA issue currently when using fast prefill with
+            # hybrid kv cache manager (e.g. interleaved sliding window)
+            # TODO(sarckk): investigate and fix
+            self.scheduler_config.disable_hybrid_kv_cache_manager = True
+
     def update_sizes_for_sequence_parallelism(self,
                                               possible_sizes: list) -> list:
         # remove the sizes that not multiple of tp_size when
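
The new block follows the surrounding `__post_init__` pattern: when a feature combination is known to be broken (here, fast prefill with the hybrid KV cache manager, pending the IMA, presumably illegal-memory-access, issue noted in the TODO), the safe setting is forced rather than raising an error. A stripped-down sketch of that pattern with hypothetical config classes, not the real vLLM ones:

# Hypothetical, stripped-down sketch of the guard pattern added above:
# two features known to conflict are resolved in __post_init__ by forcing
# the safe configuration instead of failing at runtime.
from dataclasses import dataclass, field

@dataclass
class CacheConfigSketch:
    kv_sharing_fast_prefill: bool = False

@dataclass
class SchedulerConfigSketch:
    disable_hybrid_kv_cache_manager: bool = False

@dataclass
class VllmConfigSketch:
    cache_config: CacheConfigSketch = field(default_factory=CacheConfigSketch)
    scheduler_config: SchedulerConfigSketch = field(
        default_factory=SchedulerConfigSketch)

    def __post_init__(self):
        if self.cache_config.kv_sharing_fast_prefill:
            # Fast prefill currently conflicts with the hybrid KV cache
            # manager, so the manager is disabled for now.
            self.scheduler_config.disable_hybrid_kv_cache_manager = True

cfg = VllmConfigSketch(
    cache_config=CacheConfigSketch(kv_sharing_fast_prefill=True))
assert cfg.scheduler_config.disable_hybrid_kv_cache_manager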

vllm/config/cache.py

Lines changed: 1 addition & 6 deletions
@@ -151,7 +151,7 @@ def metrics_info(self):
         # convert cache_config to dict(key: str, value: str) for prometheus
         # metrics info
         return {key: str(value) for key, value in self.__dict__.items()}
-
+
     def _verify_kv_sharing_fast_prefill(self) -> None:
         if self.kv_sharing_fast_prefill and not envs.VLLM_USE_V1:
             raise NotImplementedError(
@@ -169,11 +169,6 @@ def _verify_args(self) -> Self:
                 "GPU memory utilization must be less than 1.0. Got "
                 f"{self.gpu_memory_utilization}.")
 
-        if self.kv_sharing_fast_prefill:
-            logger.warning_once(
-                "--kv-sharing-fast-prefill is currently work in progress "
-                "and not functional yet (i.e. no prefill savings)")
-
         return self
 
     def _verify_cache_dtype(self) -> None:

vllm/model_executor/models/gemma3n.py

Lines changed: 29 additions & 24 deletions
@@ -566,7 +566,7 @@ def __init__(
         self.decoder_layers = decoder_layers
         self.layer_idx_start = layer_idx_start
         self.per_layer_model_projection = per_layer_model_projection
-        self.config = vllm_config.model_config.hf_config.text_config
+        self.config = vllm_config.model_config.hf_config
         self.embed_scale_per_layer = embed_scale_per_layer
         self.embed_tokens_per_layer = embed_tokens_per_layer
         self.per_layer_projection_norm = per_layer_projection_norm
@@ -590,13 +590,9 @@ def get_per_layer_input_embeddings(
 
     def get_per_layer_inputs(
         self,
-        input_ids: torch.Tensor,
         hidden_states_0: torch.Tensor,
+        per_layer_inputs: Optional[torch.Tensor],
     ) -> torch.Tensor:
-        per_layer_inputs = self.get_per_layer_input_embeddings(input_ids)
-        per_layer_inputs = per_layer_inputs.reshape(
-            -1, self.config.num_hidden_layers,
-            self.config.hidden_size_per_layer_input)
         per_layer_projection = self.per_layer_model_projection(hidden_states_0)
         per_layer_projection = per_layer_projection.reshape(
             *hidden_states_0.shape[:-1],
@@ -605,8 +601,12 @@
         )
         per_layer_projection = self.per_layer_projection_norm(
             per_layer_projection)
-        per_layer_inputs = per_layer_projection + per_layer_inputs
-        per_layer_inputs *= self.per_layer_input_scale
+        if per_layer_inputs is not None:
+            # Profiling run does not compute per_layer_inputs
+            per_layer_inputs = per_layer_projection + per_layer_inputs
+            per_layer_inputs *= self.per_layer_input_scale
+        else:
+            per_layer_inputs = per_layer_projection
        return per_layer_inputs
 
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
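
For reference, the combine-or-fallback logic above in self-contained form (dummy dimensions; `proj`, `norm`, and the scale constant stand in for the real per-layer projection, its norm, and `per_layer_input_scale`):

# Self-contained sketch of the per-layer-input combination introduced above.
from typing import Optional
import torch

num_tokens, hidden, num_layers, hidden_per_layer = 4, 32, 3, 8
proj = torch.nn.Linear(hidden, num_layers * hidden_per_layer)
norm = torch.nn.LayerNorm(hidden_per_layer)
per_layer_input_scale = 2 ** -0.5

def combine(hidden_states_0: torch.Tensor,
            per_layer_inputs: Optional[torch.Tensor]) -> torch.Tensor:
    per_layer_projection = proj(hidden_states_0).reshape(
        num_tokens, num_layers, hidden_per_layer)
    per_layer_projection = norm(per_layer_projection)
    if per_layer_inputs is not None:
        # Normal run: add the precomputed per-layer token embeddings.
        return (per_layer_projection + per_layer_inputs) * per_layer_input_scale
    # Profiling run: no per-layer embeddings were computed upstream,
    # so only the projection is used.
    return per_layer_projection

h0 = torch.randn(num_tokens, hidden)
ple = torch.randn(num_tokens, num_layers, hidden_per_layer)
print(combine(h0, ple).shape, combine(h0, None).shape)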
@@ -632,15 +632,16 @@
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         inputs_embeds: Optional[torch.Tensor] = None,
+        per_layer_inputs: Optional[torch.Tensor] = None,
         **kwargs,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         if inputs_embeds is not None:
             hidden_states_0 = inputs_embeds
         else:
             hidden_states_0 = self.get_input_embeddings(input_ids)
 
-        per_layer_inputs = self.get_per_layer_inputs(input_ids,
-                                                     hidden_states_0)
+        adjusted_per_layer_inputs = self.get_per_layer_inputs(
+            hidden_states_0, per_layer_inputs)
         hidden_states = self.altup_embed(hidden_states_0)
 
         # [altnum_inputs, num_tokens, hidden_size]
@@ -652,14 +653,14 @@
             hidden_states = layer(
                 positions=positions,
                 hidden_states=hidden_states,
-                per_layer_input=per_layer_inputs[:, layer_idx, :],
+                per_layer_input=adjusted_per_layer_inputs[:, layer_idx, :],
                 **kwargs,
             )
 
         # [num_tokens, hidden_size, altnum_inputs]
         hidden_states = hidden_states.permute(1, 2, 0)
 
-        return hidden_states, per_layer_inputs
+        return hidden_states, adjusted_per_layer_inputs
 
 
 # This enables torch.compile if --kv-sharing-fast-prefill passed
@@ -853,6 +854,7 @@ def fast_prefill_forward(
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         inputs_embeds: Optional[torch.Tensor] = None,
+        per_layer_inputs: Optional[torch.Tensor] = None,
         **kwargs,
     ) -> torch.Tensor:
         logits_indices_padded, num_logits_indices = None, None
@@ -873,13 +875,14 @@
         # Copy inputs for cudagraph
         batch_size = positions.size(0)
         self.positions[:batch_size].copy_(positions)
-        # input_ids and inputs_embeds are allocated in model runner
-        self_decoder_hidden_states, per_layer_inputs = self.self_decoder(
-            input_ids=input_ids,
-            positions=self.positions[:batch_size],
-            inputs_embeds=inputs_embeds,
-            **kwargs,
-        )
+        self_decoder_hidden_states, per_layer_inputs_adjusted = \
+            self.self_decoder(
+                input_ids=input_ids,
+                positions=self.positions[:batch_size],
+                inputs_embeds=inputs_embeds,
+                per_layer_inputs=per_layer_inputs,
+                **kwargs,
+            )
 
         if logits_indices_padded is None:
             logits_indices_padded = torch.arange(
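
The context lines above copy variable-size inputs into preallocated buffers (`self.positions[:batch_size].copy_(...)`) before the decoder runs, the usual arrangement for CUDA graph replay, where input addresses must stay fixed between capture and replay. A stripped-down sketch of that buffer pattern, with made-up sizes and no actual graph capture:

# Hypothetical sketch of the persistent-buffer pattern around the
# self_decoder call: fixed-capacity buffers are allocated once, and each
# step copies the live inputs into a prefix slice of those buffers so a
# captured graph can always read from the same memory addresses.
import torch

MAX_TOKENS, HIDDEN = 8192, 64
positions_buf = torch.zeros(MAX_TOKENS, dtype=torch.long)
hidden_buf = torch.zeros(MAX_TOKENS, HIDDEN)

def run_step(positions: torch.Tensor, hidden: torch.Tensor) -> torch.Tensor:
    n = positions.size(0)
    positions_buf[:n].copy_(positions)  # in-place copy; buffer address is stable
    hidden_buf[:n].copy_(hidden)
    # The real code would now call the (possibly graph-captured) decoder on
    # positions_buf[:n] / hidden_buf[:n]; here we just return a slice.
    return hidden_buf[:n]

out = run_step(torch.arange(5), torch.randn(5, HIDDEN))
print(out.shape)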
@@ -903,7 +906,7 @@
         self.hidden_states[:num_padded_logits_indices].copy_(
             self_decoder_hidden_states[logits_indices_padded])
         self.per_layer_inputs[:num_padded_logits_indices].copy_(
-            per_layer_inputs[logits_indices_padded])
+            per_layer_inputs_adjusted[logits_indices_padded])
         cross_decoder_hidden_states = self.cross_decoder(
             positions=self.positions[:num_padded_logits_indices],
             hidden_states=self.hidden_states[:num_padded_logits_indices],
@@ -926,12 +929,14 @@ def normal_forward(
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         inputs_embeds: Optional[torch.Tensor] = None,
+        per_layer_inputs: Optional[torch.Tensor] = None,
         **kwargs,
     ) -> torch.Tensor:
         hidden_states, per_layer_inputs = self.self_decoder(
             input_ids=input_ids,
             positions=positions,
             inputs_embeds=inputs_embeds,
+            per_layer_inputs=per_layer_inputs,
             **kwargs,
         )
         hidden_states = self.cross_decoder(
@@ -966,25 +971,25 @@ def forward(
         self,
         input_ids: Optional[torch.Tensor],
         positions: torch.Tensor,
+        per_layer_inputs: Optional[torch.Tensor] = None,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
         inputs_embeds: Optional[torch.Tensor] = None,
         **kwargs,
     ) -> Union[torch.Tensor, IntermediateTensors]:
-        # Per layer inputs.
-        if input_ids is None:
-            raise ValueError("Passing None for input ids is not supported.")
-
         if self.fast_prefill_enabled:
             hidden_states = self.fast_prefill_forward(
                 input_ids,
                 positions,
                 inputs_embeds,
+                per_layer_inputs,
                 **kwargs,
             )
         else:
             hidden_states = self.normal_forward(
                 input_ids,
                 positions,
                 inputs_embeds,
+                per_layer_inputs,
                 **kwargs,
             )
         hidden_states = self.altup_unembed(hidden_states)

vllm/model_executor/models/gemma3n_mm.py

Lines changed: 1 addition & 1 deletion
@@ -624,7 +624,7 @@ def get_input_embeddings(
         # NOTE (NickLucche) Each pass needs tokens to compute PLE so we cache
         # them here, as the model forward has only access to the input_embeds.
         if input_ids is not None:
-            per_layer_inputs = self.language_model.model.get_per_layer_input_embeddings(
+            per_layer_inputs = self.language_model.model.self_decoder.get_per_layer_input_embeddings(
                 input_ids)
             per_layer_inputs = per_layer_inputs.reshape(
                 -1, self.config.text_config.num_hidden_layers,
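
Together with the gemma3n.py changes above, the multimodal wrapper now computes the per-layer token embeddings once from `input_ids`, reshapes them to `[num_tokens, num_layers, hidden_per_layer]`, and hands them to the language model as `per_layer_inputs`. A toy version of that lookup-and-reshape step (hypothetical embedding table and sizes, not the real PLE module):

# Toy sketch of the caller-side step: look up per-layer embeddings from
# token ids, then reshape to [num_tokens, num_layers, hidden_per_layer]
# before passing them to the language model as `per_layer_inputs`.
import torch

vocab_size, num_layers, hidden_per_layer = 100, 3, 8
# Stand-in for get_per_layer_input_embeddings: one flat embedding per token
# covering all layers at once.
embed = torch.nn.Embedding(vocab_size, num_layers * hidden_per_layer)

input_ids = torch.tensor([1, 5, 7, 42])
per_layer_inputs = embed(input_ids).reshape(-1, num_layers, hidden_per_layer)
print(per_layer_inputs.shape)  # torch.Size([4, 3, 8])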
