From 4740f4d362784957cdaeac4f24483c1e7555a5f5 Mon Sep 17 00:00:00 2001
From: adabeyta
Date: Fri, 17 Oct 2025 19:05:47 +0000
Subject: [PATCH 1/2] Fix calculate_kv_scales=True bug

Signed-off-by: adabeyta
---
 vllm/v1/attention/backends/utils.py |  2 ++
 vllm/v1/worker/gpu_model_runner.py  | 14 ++++++++++++++
 2 files changed, 16 insertions(+)

diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py
index cb5855548098..b2c7ffd337fe 100644
--- a/vllm/v1/attention/backends/utils.py
+++ b/vllm/v1/attention/backends/utils.py
@@ -94,6 +94,8 @@ class CommonAttentionMetadata:
     dcp_local_seq_lens: torch.Tensor | None = None
     """Sequence lengths of the local rank in decode context parallelism world"""
 
+    enable_kv_scales_calculation: bool = False
+
 
 def slice_query_start_locs(
     query_start_loc: torch.Tensor,
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 7c2cb701fd64..30afd976a284 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -249,6 +249,8 @@ def __init__(
         # This will be overridden in load_model()
         self.is_multimodal_pruning_enabled = False
         self.max_model_len = model_config.max_model_len
+
+        self.kv_scales_calculated = False
         self.dcp_world_size = self.parallel_config.decode_context_parallel_size
         self.max_num_tokens = scheduler_config.max_num_batched_tokens
         self.max_num_reqs = scheduler_config.max_num_seqs
@@ -1328,6 +1330,12 @@ def _prepare_inputs(
                 kv_cache_group_id
             ]
 
+            # Determine if we need to calculate KV scales on this forward pass.
+            # Only True on the first pass when calculate_kv_scales is enabled.
+            enable_kv_scales_calculation = (
+                self.cache_config.calculate_kv_scales
+                and not self.kv_scales_calculated)
+
             common_attn_metadata = CommonAttentionMetadata(
                 query_start_loc=query_start_loc,
                 query_start_loc_cpu=query_start_loc_cpu,
@@ -1347,6 +1355,7 @@ def _prepare_inputs(
                 dcp_local_seq_lens=self.dcp_local_seq_lens.gpu[:num_reqs]
                 if self.dcp_world_size > 1
                 else None,
+                enable_kv_scales_calculation=enable_kv_scales_calculation,
             )
 
             if self.speculative_config and spec_decode_common_attn_metadata is None:
@@ -2525,6 +2534,11 @@ def execute_model(
                 **model_kwargs,
             )
 
+        # Mark KV scales as calculated after the first forward pass
+        if (self.cache_config.calculate_kv_scales
+                and not self.kv_scales_calculated):
+            self.kv_scales_calculated = True
+
         with record_function_or_nullcontext("Postprocess"):
             if self.use_aux_hidden_state_outputs:
                 # True when EAGLE 3 is used.
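The first-pass-only control flow that this patch threads through the attention metadata can be sketched in isolation as follows. This is a simplified stand-in, not vLLM's real CommonAttentionMetadata, Attention, or GPUModelRunner classes; AttnMetadata, ToyLayer, and ToyRunner are invented names used only to illustrate the gating pattern.

from dataclasses import dataclass

import torch


@dataclass
class AttnMetadata:
    # Stand-in for the new CommonAttentionMetadata field: True only on the
    # first forward pass when calculate_kv_scales is enabled.
    enable_kv_scales_calculation: bool = False


class ToyLayer:
    def __init__(self) -> None:
        self.k_scale = torch.ones(())

    def maybe_calc_kv_scales(self, key: torch.Tensor, meta: AttnMetadata) -> None:
        # Metadata-gated check: skip unless this pass was flagged.
        if not meta.enable_kv_scales_calculation:
            return
        self.k_scale = key.abs().max()


class ToyRunner:
    def __init__(self, calculate_kv_scales: bool) -> None:
        self.calculate_kv_scales = calculate_kv_scales
        self.kv_scales_calculated = False
        self.layer = ToyLayer()

    def execute(self, key: torch.Tensor) -> None:
        # Request scale calculation only on the very first pass.
        meta = AttnMetadata(
            enable_kv_scales_calculation=(
                self.calculate_kv_scales and not self.kv_scales_calculated
            )
        )
        self.layer.maybe_calc_kv_scales(key, meta)
        # Record that scales were computed so later passes skip the work.
        if meta.enable_kv_scales_calculation:
            self.kv_scales_calculated = True


runner = ToyRunner(calculate_kv_scales=True)
runner.execute(torch.randn(4, 8))  # scales computed on the first pass
runner.execute(torch.randn(4, 8))  # skipped on every later pass

In this shape the runner owns the one-shot bookkeeping (kv_scales_calculated) while each layer only consumes the per-pass flag it is handed; patch 2 below replaces that plumbing with state kept on the layer itself.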
From c1e72a124bcd1a95181045e67074b2fdf00160c2 Mon Sep 17 00:00:00 2001
From: adabeyta
Date: Tue, 21 Oct 2025 04:31:40 +0000
Subject: [PATCH 2/2] Replace metadata-based KV scales flag with direct layer state check

Signed-off-by: adabeyta
---
 vllm/attention/layer.py             | 12 ++++--------
 vllm/v1/attention/backends/utils.py |  2 --
 vllm/v1/worker/gpu_model_runner.py  | 28 +++++++---------------------
 3 files changed, 11 insertions(+), 31 deletions(-)

diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index a028be6ce7f8..17d9879562c4 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -836,17 +836,13 @@ def maybe_calc_kv_scales(
     layer_name: str,
 ) -> None:
     forward_context: ForwardContext = get_forward_context()
-    attn_metadata = forward_context.attn_metadata
-
-    if isinstance(attn_metadata, dict):
-        attn_metadata = attn_metadata[layer_name]
+    self = forward_context.no_compile_layers[layer_name]
 
-    if attn_metadata is None or not getattr(
-        attn_metadata, "enable_kv_scales_calculation", False
-    ):
+    # Only calculate if the layer's calculate_kv_scales flag is True
+    # This flag gets set to False after the first forward pass
+    if not self.calculate_kv_scales:
        return
 
-    self = forward_context.no_compile_layers[layer_name]
     self.calc_kv_scales(query, key, value)
 
 
diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py
index b2c7ffd337fe..cb5855548098 100644
--- a/vllm/v1/attention/backends/utils.py
+++ b/vllm/v1/attention/backends/utils.py
@@ -94,8 +94,6 @@ class CommonAttentionMetadata:
     dcp_local_seq_lens: torch.Tensor | None = None
     """Sequence lengths of the local rank in decode context parallelism world"""
 
-    enable_kv_scales_calculation: bool = False
-
 
 def slice_query_start_locs(
     query_start_loc: torch.Tensor,
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 30afd976a284..f31ba3950505 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -250,7 +250,8 @@ def __init__(
         self.is_multimodal_pruning_enabled = False
         self.max_model_len = model_config.max_model_len
 
-        self.kv_scales_calculated = False
+        # Always set to false after the first forward pass
+        self.calculate_kv_scales = self.cache_config.calculate_kv_scales
         self.dcp_world_size = self.parallel_config.decode_context_parallel_size
         self.max_num_tokens = scheduler_config.max_num_batched_tokens
         self.max_num_reqs = scheduler_config.max_num_seqs
@@ -1330,12 +1331,6 @@ def _prepare_inputs(
                 kv_cache_group_id
             ]
 
-            # Determine if we need to calculate KV scales on this forward pass.
-            # Only True on the first pass when calculate_kv_scales is enabled.
-            enable_kv_scales_calculation = (
-                self.cache_config.calculate_kv_scales
-                and not self.kv_scales_calculated)
-
             common_attn_metadata = CommonAttentionMetadata(
                 query_start_loc=query_start_loc,
                 query_start_loc_cpu=query_start_loc_cpu,
@@ -1355,7 +1350,6 @@ def _prepare_inputs(
                 dcp_local_seq_lens=self.dcp_local_seq_lens.gpu[:num_reqs]
                 if self.dcp_world_size > 1
                 else None,
-                enable_kv_scales_calculation=enable_kv_scales_calculation,
             )
 
             if self.speculative_config and spec_decode_common_attn_metadata is None:
@@ -2500,16 +2494,10 @@ def execute_model(
         )
 
         # Set cudagraph mode to none if calc_kv_scales is true.
-        if attn_metadata is not None:
-            metadata_list = (
-                attn_metadata.values()
-                if isinstance(attn_metadata, dict)
-                else [attn_metadata]
-            )
-            if any(
-                getattr(m, "enable_kv_scales_calculation", False) for m in metadata_list
-            ):
-                cudagraph_runtime_mode = CUDAGraphMode.NONE
+        # KV scales calculation involves dynamic operations that are incompatible
+        # with CUDA graph capture.
+        if self.calculate_kv_scales:
+            cudagraph_runtime_mode = CUDAGraphMode.NONE
 
         # Run the model.
         # Use persistent buffers for CUDA graphs.
@@ -2535,9 +2523,7 @@ def execute_model(
             )
 
         # Mark KV scales as calculated after the first forward pass
-        if (self.cache_config.calculate_kv_scales
-                and not self.kv_scales_calculated):
-            self.kv_scales_calculated = True
+        self.calculate_kv_scales = False
 
         with record_function_or_nullcontext("Postprocess"):
             if self.use_aux_hidden_state_outputs:
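Patch 2 removes the metadata plumbing: the custom op reads the gate straight off the layer via forward_context.no_compile_layers, and the model runner keeps only its own calculate_kv_scales copy to force CUDAGraphMode.NONE on the first pass. Below is a minimal sketch of the per-layer shape, assuming, as the patch comments imply, that calc_kv_scales clears the layer's own flag after the first call; ToyAttention and its methods are illustrative names, not vLLM's actual API.

import torch


class ToyAttention:
    def __init__(self, calculate_kv_scales: bool) -> None:
        # The flag now lives on the layer itself; no metadata field is needed.
        self.calculate_kv_scales = calculate_kv_scales
        self.k_scale = torch.ones(())

    def maybe_calc_kv_scales(self, key: torch.Tensor) -> None:
        # Same check as the patched custom op: consult the layer's own state.
        if not self.calculate_kv_scales:
            return
        self.calc_kv_scales(key)

    def calc_kv_scales(self, key: torch.Tensor) -> None:
        self.k_scale = key.abs().max()
        # Assumed behaviour: the layer clears its own flag after the first call.
        self.calculate_kv_scales = False


layer = ToyAttention(calculate_kv_scales=True)
layer.maybe_calc_kv_scales(torch.randn(4, 8))  # computes scales, clears the flag
layer.maybe_calc_kv_scales(torch.randn(4, 8))  # no-op on later passes

Keeping the flag on the layer also means the check no longer depends on which backend's metadata object happens to be in the forward context, which is what the deleted dict/getattr handling had to account for.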