Move fast prefill layer init to initialize_kv_cache

sarckk · sarckk · commit 6464c1558655 · 2025-08-19T16:53:48.000-07:00
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
@@ -3201,6 +3201,8 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
             kv_cache_config: Configuration for the KV cache, including the KV
             cache size of each layer
         """
+        attn_layers = get_layers_from_vllm_config(self.vllm_config, Attention)
+        self.maybe_add_kv_sharing_fast_prefill_layers(attn_layers)
         self.kv_cache_config = kv_cache_config
         self.may_reinitialize_input_batch(kv_cache_config)
         self.maybe_add_kv_sharing_layers_to_kv_cache_groups(kv_cache_config)
@@ -3323,8 +3325,6 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
                     page_size_padded=page_size_padded,
                     mamba_type=mamba_module.mamba_type)
 
-        self.maybe_add_kv_sharing_fast_prefill_layers(attn_layers)
-
         return kv_cache_spec
 
     def _build_encoder_only_attn_metadata(