
Commit b30f23a

support fp8 TRTLLM attn kernel
Signed-off-by: elvischenv <[email protected]>
1 parent: 74333ae

File tree

7 files changed: +363 -102 lines changed

vllm/attention/backends/abstract.py

Lines changed: 9 additions & 0 deletions
@@ -304,6 +304,15 @@ def fused_output_quant_supported(self, dtype: torch.dtype, static: bool,
         """
         return False
 
+    def inserted_input_quant_supported(self, dtype: torch.dtype, static: bool,
+                                       group_shape: GroupShape):
+        """
+        Does this attention implementation support inserted input quantization.
+        This is used by the AttnFusionPass to insert input quantization
+        for backends that support it.
+        """
+        return False
+
 
 class MLAAttentionImpl(AttentionImpl[T], Generic[T]):
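For context, a backend that can consume a statically quantized input would override this new hook to advertise support. The sketch below is illustrative only and is not part of this commit: the class name is hypothetical, AttentionImpl and GroupShape are assumed to be in scope as in abstract.py, and the acceptance policy (static fp8 e4m3 only) is an assumption.

# Illustrative sketch, not part of this commit: a hypothetical backend
# advertising support for inserted input quantization. Other required
# AttentionImpl methods are omitted for brevity.
import torch


class HypotheticalFp8AttentionImpl(AttentionImpl):

    def inserted_input_quant_supported(self, dtype: torch.dtype, static: bool,
                                       group_shape: GroupShape) -> bool:
        # Assumed policy: only static fp8 (e4m3) input quantization is
        # accepted; a real backend would also validate group_shape
        # (e.g. per-tensor only).
        return dtype == torch.float8_e4m3fn and static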

vllm/attention/backends/flashinfer.py

Lines changed: 8 additions & 4 deletions
@@ -1114,10 +1114,14 @@ def forward(
             assert decode_meta.decode_wrapper._sm_scale == softmax_scale
             # TODO: @pavanimajety Remove this once the switch happens
             # inside flashinfer.
-            if not use_trtllm_attention(
-                    num_decode_tokens, attn_metadata.max_decode_seq_len,
-                    kv_cache_dtype, attn_metadata.num_qo_heads,
-                    attn_metadata.num_kv_heads, attn_metadata.head_dim):
+            if not use_trtllm_attention(attn_metadata.num_qo_heads,
+                                        attn_metadata.num_kv_heads,
+                                        attn_metadata.head_dim,
+                                        window_left,
+                                        num_decode_tokens,
+                                        attn_metadata.max_decode_seq_len,
+                                        kv_cache_dtype,
+                                        is_prefill=False):
                 decode_meta.decode_wrapper.run(
                     decode_query,
                     kv_cache.permute(*stride_order),
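The updated call site reorders the use_trtllm_attention arguments: head geometry first, then sliding window, workload size, max decode sequence length, KV-cache dtype, and an explicit is_prefill flag. As a readability sketch only (this helper does not exist in the commit, and use_trtllm_attention is assumed to be in scope as at the call site above), the decode-path gate could be factored out like this:

# Illustrative helper, not part of this commit: wraps the updated
# use_trtllm_attention call with the argument order shown in the diff above.
def _use_trtllm_decode(attn_metadata, window_left, num_decode_tokens,
                       kv_cache_dtype):
    # Head geometry first, then window, workload size, max seq len,
    # KV-cache dtype, and the new is_prefill flag.
    return use_trtllm_attention(attn_metadata.num_qo_heads,
                                attn_metadata.num_kv_heads,
                                attn_metadata.head_dim,
                                window_left,
                                num_decode_tokens,
                                attn_metadata.max_decode_seq_len,
                                kv_cache_dtype,
                                is_prefill=False)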

vllm/attention/layer.py

Lines changed: 14 additions & 2 deletions
@@ -126,10 +126,11 @@ def __init__(
         self._q_scale = torch.tensor(1.0, dtype=torch.float32)
         self._prob_scale = torch.tensor(1.0, dtype=torch.float32)
 
-        # We also keep the float32 versions of k/v_scale for attention
+        # We also keep the float32 versions of k/v/o_scale for attention
         # backends that don't support tensors (Flashinfer)
         self._k_scale_float = 1.0
         self._v_scale_float = 1.0
+        self._o_scale_float = 1.0
 
         self.use_mla = use_mla
         self.num_heads = num_heads
@@ -195,6 +196,9 @@ def __init__(
         self.layer_name = prefix
         self.attn_type = attn_type
 
+        self.enabled_fusion = compilation_config.pass_config.enable_attn_fusion
+        self.fused_quant = False
+
         if kv_sharing_target_layer_name is not None:
             validate_kv_sharing_target(
                 prefix,
@@ -273,7 +277,13 @@ def forward(
                                   output=output)
             else:
                 torch.ops.vllm.unified_attention_with_output(
-                    query, key, value, output, self.layer_name)
+                    query,
+                    key,
+                    value,
+                    output,
+                    self.layer_name,
+                    query_scale=(self._q_scale
+                                 if self.enabled_fusion else None))
             return output.view(-1, hidden_size)
         else:
             if self.use_direct_call:
@@ -476,6 +486,7 @@ def unified_attention_with_output(
     value: torch.Tensor,
     output: torch.Tensor,
     layer_name: str,
+    query_scale: Optional[torch.Tensor] = None,
     output_scale: Optional[torch.Tensor] = None,
 ) -> None:
     wait_for_kv_layer_from_connector(layer_name)
@@ -503,6 +514,7 @@ def unified_attention_with_output_fake(
     value: torch.Tensor,
    output: torch.Tensor,
    layer_name: str,
+    query_scale: Optional[torch.Tensor] = None,
    output_scale: Optional[torch.Tensor] = None,
 ) -> None:
     return
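Threading query_scale through unified_attention_with_output lets a backend (or the attention fusion pass) apply static fp8 query quantization with the layer's _q_scale rather than relying on a separate quant op in the graph. The commit does not show that consumption; the snippet below is only a self-contained illustration of what static per-tensor fp8 (e4m3) quantization of the query with such a scale looks like, with an assumed scale value and tensor shapes.

# Self-contained illustration (not code from this commit) of static
# per-tensor fp8 quantization of the query with a float32 scale.
import torch


def quantize_query_fp8(query: torch.Tensor,
                       q_scale: torch.Tensor) -> torch.Tensor:
    # Divide by the static scale, clamp to the e4m3 range, and cast.
    finfo = torch.finfo(torch.float8_e4m3fn)
    q = (query.float() / q_scale).clamp(finfo.min, finfo.max)
    return q.to(torch.float8_e4m3fn)


# Usage with dummy data and an assumed scale value:
query = torch.randn(4, 32, 128, dtype=torch.bfloat16)
q_scale = torch.tensor(0.05, dtype=torch.float32)
q_fp8 = quantize_query_fp8(query, q_scale)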
