From 4740f4d362784957cdaeac4f24483c1e7555a5f5 Mon Sep 17 00:00:00 2001
From: adabeyta
Date: Fri, 17 Oct 2025 19:05:47 +0000
Subject: [PATCH 1/2] Fix calculate_kv_scales=True bug

Signed-off-by: adabeyta
---
 vllm/v1/attention/backends/utils.py |  2 ++
 vllm/v1/worker/gpu_model_runner.py  | 14 ++++++++++++++
 2 files changed, 16 insertions(+)

diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py
index cb5855548098..b2c7ffd337fe 100644
--- a/vllm/v1/attention/backends/utils.py
+++ b/vllm/v1/attention/backends/utils.py
@@ -94,6 +94,8 @@ class CommonAttentionMetadata:
     dcp_local_seq_lens: torch.Tensor | None = None
     """Sequence lengths of the local rank in decode context parallelism world"""
 
+    enable_kv_scales_calculation: bool = False
+
 
 def slice_query_start_locs(
     query_start_loc: torch.Tensor,
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 7c2cb701fd64..30afd976a284 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -249,6 +249,8 @@ def __init__(
         # This will be overridden in load_model()
         self.is_multimodal_pruning_enabled = False
         self.max_model_len = model_config.max_model_len
+
+        self.kv_scales_calculated = False
         self.dcp_world_size = self.parallel_config.decode_context_parallel_size
         self.max_num_tokens = scheduler_config.max_num_batched_tokens
         self.max_num_reqs = scheduler_config.max_num_seqs
@@ -1328,6 +1330,12 @@ def _prepare_inputs(
                 kv_cache_group_id
             ]
 
+            # Determine if we need to calculate KV scales on this forward pass.
+            # Only True on the first pass when calculate_kv_scales is enabled.
+            enable_kv_scales_calculation = (
+                self.cache_config.calculate_kv_scales
+                and not self.kv_scales_calculated)
+
             common_attn_metadata = CommonAttentionMetadata(
                 query_start_loc=query_start_loc,
                 query_start_loc_cpu=query_start_loc_cpu,
@@ -1347,6 +1355,7 @@ def _prepare_inputs(
                 dcp_local_seq_lens=self.dcp_local_seq_lens.gpu[:num_reqs]
                 if self.dcp_world_size > 1
                 else None,
+                enable_kv_scales_calculation=enable_kv_scales_calculation,
             )
 
             if self.speculative_config and spec_decode_common_attn_metadata is None:
@@ -2525,6 +2534,11 @@ def execute_model(
                 **model_kwargs,
             )
 
+        # Mark KV scales as calculated after the first forward pass
+        if (self.cache_config.calculate_kv_scales
+                and not self.kv_scales_calculated):
+            self.kv_scales_calculated = True
+
         with record_function_or_nullcontext("Postprocess"):
             if self.use_aux_hidden_state_outputs:
                 # True when EAGLE 3 is used.
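The first-pass-only control flow that this patch threads through the attention metadata can be sketched in isolation as follows. This is a simplified stand-in, not vLLM's real CommonAttentionMetadata, Attention, or GPUModelRunner classes; AttnMetadata, ToyLayer, and ToyRunner are invented names used only to illustrate the gating pattern.

from dataclasses import dataclass

import torch


@dataclass
class AttnMetadata:
    # Stand-in for the new CommonAttentionMetadata field: True only on the
    # first forward pass when calculate_kv_scales is enabled.
    enable_kv_scales_calculation: bool = False


class ToyLayer:
    def __init__(self) -> None:
        self.k_scale = torch.ones(())

    def maybe_calc_kv_scales(self, key: torch.Tensor, meta: AttnMetadata) -> None:
        # Metadata-gated check: skip unless this pass was flagged.
        if not meta.enable_kv_scales_calculation:
            return
        self.k_scale = key.abs().max()


class ToyRunner:
    def __init__(self, calculate_kv_scales: bool) -> None:
        self.calculate_kv_scales = calculate_kv_scales
        self.kv_scales_calculated = False
        self.layer = ToyLayer()

    def execute(self, key: torch.Tensor) -> None:
        # Request scale calculation only on the very first pass.
        meta = AttnMetadata(
            enable_kv_scales_calculation=(
                self.calculate_kv_scales and not self.kv_scales_calculated
            )
        )
        self.layer.maybe_calc_kv_scales(key, meta)
        # Record that scales were computed so later passes skip the work.
        if meta.enable_kv_scales_calculation:
            self.kv_scales_calculated = True


runner = ToyRunner(calculate_kv_scales=True)
runner.execute(torch.randn(4, 8))  # scales computed on the first pass
runner.execute(torch.randn(4, 8))  # skipped on every later pass

In this shape the runner owns the one-shot bookkeeping (kv_scales_calculated) while each layer only consumes the per-pass flag it is handed; patch 2 below replaces that plumbing with state kept on the layer itself.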
From c1e72a124bcd1a95181045e67074b2fdf00160c2 Mon Sep 17 00:00:00 2001
From: adabeyta
Date: Tue, 21 Oct 2025 04:31:40 +0000
Subject: [PATCH 2/2] Replace metadata-based KV scales flag with direct layer state check

Signed-off-by: adabeyta
---
 vllm/attention/layer.py             | 12 ++++--------
 vllm/v1/attention/backends/utils.py |  2 --
 vllm/v1/worker/gpu_model_runner.py  | 28 +++++++---------------------
 3 files changed, 11 insertions(+), 31 deletions(-)

diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index a028be6ce7f8..17d9879562c4 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -836,17 +836,13 @@ def maybe_calc_kv_scales(
     layer_name: str,
 ) -> None:
     forward_context: ForwardContext = get_forward_context()
-    attn_metadata = forward_context.attn_metadata
-
-    if isinstance(attn_metadata, dict):
-        attn_metadata = attn_metadata[layer_name]
+    self = forward_context.no_compile_layers[layer_name]
 
-    if attn_metadata is None or not getattr(
-        attn_metadata, "enable_kv_scales_calculation", False
-    ):
+    # Only calculate if the layer's calculate_kv_scales flag is True
+    # This flag gets set to False after the first forward pass
+    if not self.calculate_kv_scales:
        return
 
-    self = forward_context.no_compile_layers[layer_name]
     self.calc_kv_scales(query, key, value)
 
 
diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py
index b2c7ffd337fe..cb5855548098 100644
--- a/vllm/v1/attention/backends/utils.py
+++ b/vllm/v1/attention/backends/utils.py
@@ -94,8 +94,6 @@ class CommonAttentionMetadata:
     dcp_local_seq_lens: torch.Tensor | None = None
     """Sequence lengths of the local rank in decode context parallelism world"""
 
-    enable_kv_scales_calculation: bool = False
-
 
 def slice_query_start_locs(
     query_start_loc: torch.Tensor,
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 30afd976a284..f31ba3950505 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -250,7 +250,8 @@ def __init__(
         self.is_multimodal_pruning_enabled = False
         self.max_model_len = model_config.max_model_len
 
-        self.kv_scales_calculated = False
+        # Always set to false after the first forward pass
+        self.calculate_kv_scales = self.cache_config.calculate_kv_scales
         self.dcp_world_size = self.parallel_config.decode_context_parallel_size
         self.max_num_tokens = scheduler_config.max_num_batched_tokens
         self.max_num_reqs = scheduler_config.max_num_seqs
@@ -1330,12 +1331,6 @@ def _prepare_inputs(
                 kv_cache_group_id
             ]
 
-            # Determine if we need to calculate KV scales on this forward pass.
-            # Only True on the first pass when calculate_kv_scales is enabled.
-            enable_kv_scales_calculation = (
-                self.cache_config.calculate_kv_scales
-                and not self.kv_scales_calculated)
-
             common_attn_metadata = CommonAttentionMetadata(
                 query_start_loc=query_start_loc,
                 query_start_loc_cpu=query_start_loc_cpu,
@@ -1355,7 +1350,6 @@ def _prepare_inputs(
                 dcp_local_seq_lens=self.dcp_local_seq_lens.gpu[:num_reqs]
                 if self.dcp_world_size > 1
                 else None,
-                enable_kv_scales_calculation=enable_kv_scales_calculation,
             )
 
             if self.speculative_config and spec_decode_common_attn_metadata is None:
@@ -2500,16 +2494,10 @@ def execute_model(
         )
 
         # Set cudagraph mode to none if calc_kv_scales is true.
-        if attn_metadata is not None:
-            metadata_list = (
-                attn_metadata.values()
-                if isinstance(attn_metadata, dict)
-                else [attn_metadata]
-            )
-            if any(
-                getattr(m, "enable_kv_scales_calculation", False) for m in metadata_list
-            ):
-                cudagraph_runtime_mode = CUDAGraphMode.NONE
+        # KV scales calculation involves dynamic operations that are incompatible
+        # with CUDA graph capture.
+        if self.calculate_kv_scales:
+            cudagraph_runtime_mode = CUDAGraphMode.NONE
 
         # Run the model.
         # Use persistent buffers for CUDA graphs.
@@ -2535,9 +2523,7 @@ def execute_model(
             )
 
         # Mark KV scales as calculated after the first forward pass
-        if (self.cache_config.calculate_kv_scales
-                and not self.kv_scales_calculated):
-            self.kv_scales_calculated = True
+        self.calculate_kv_scales = False
 
         with record_function_or_nullcontext("Postprocess"):
             if self.use_aux_hidden_state_outputs:
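Patch 2 removes the metadata plumbing: the custom op reads the gate straight off the layer via forward_context.no_compile_layers, and the model runner keeps only its own calculate_kv_scales copy to force CUDAGraphMode.NONE on the first pass. Below is a minimal sketch of the per-layer shape, assuming, as the patch comments imply, that calc_kv_scales clears the layer's own flag after the first call; ToyAttention and its methods are illustrative names, not vLLM's actual API.

import torch


class ToyAttention:
    def __init__(self, calculate_kv_scales: bool) -> None:
        # The flag now lives on the layer itself; no metadata field is needed.
        self.calculate_kv_scales = calculate_kv_scales
        self.k_scale = torch.ones(())

    def maybe_calc_kv_scales(self, key: torch.Tensor) -> None:
        # Same check as the patched custom op: consult the layer's own state.
        if not self.calculate_kv_scales:
            return
        self.calc_kv_scales(key)

    def calc_kv_scales(self, key: torch.Tensor) -> None:
        self.k_scale = key.abs().max()
        # Assumed behaviour: the layer clears its own flag after the first call.
        self.calculate_kv_scales = False


layer = ToyAttention(calculate_kv_scales=True)
layer.maybe_calc_kv_scales(torch.randn(4, 8))  # computes scales, clears the flag
layer.maybe_calc_kv_scales(torch.randn(4, 8))  # no-op on later passes

Keeping the flag on the layer also means the check no longer depends on which backend's metadata object happens to be in the forward context, which is what the deleted dict/getattr handling had to account for.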