Skip to content

Commit 63d1013

Browse files
committed
Restore is_query_prescaled heuristic and can_move_scale_after_matmul fallback

Address PR #34177 review comments:
- [HIGH] Restore the can_move_scale_after_matmul() size-based heuristic as a performance fallback for non-prescaled query cases (e.g. decode, S_q = 1).
- [LOW] Reword comments so they do not imply SDPAFusion is always involved.

Three-way scale placement logic:
1. Q pre-scaled (Multiply(Q, scalar_const)) -> scale K^T (precision fix)
2. can_move_scale_after_matmul -> scale after MatMul (performance optimization)
3. Default -> scale Q
1 parent fa89e73 commit 63d1013

File tree

2 files changed

+86
-13
lines changed

2 files changed

+86
-13
lines changed

src/common/transformations/src/transformations/op_conversions/scaled_dot_product_attention_decomposition.cpp

Lines changed: 63 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,55 @@ namespace v3 = ov::op::v3;
4343
namespace v4 = ov::op::v4;
4444
namespace v8 = ov::op::v8;
4545
namespace v13 = ov::op::v13;
46+
namespace {
47+
48+
// Checks if query is Multiply(input, scalar_constant), indicating Q was pre-scaled
49+
// (common when PyTorch exports symmetric Q/K scaling via scaled_dot_product_attention).
50+
// When detected, applying SDPA scale to K^T instead of Q preserves the original
51+
// computation order and minimizes FP rounding divergence across transformer layers.
52+
bool is_query_prescaled(const ov::Output<ov::Node>& query) {
53+
auto mul = ov::as_type_ptr<v1::Multiply>(query.get_node_shared_ptr());
54+
if (!mul)
55+
return false;
56+
for (size_t i = 0; i < 2; ++i) {
57+
auto constant = ov::as_type_ptr<v0::Constant>(mul->input_value(i).get_node_shared_ptr());
58+
if (constant) {
59+
const auto& shape = constant->get_shape();
60+
if (ov::shape_size(shape) == 1)
61+
return true;
62+
}
63+
}
64+
return false;
65+
}
66+
67+
// Decides whether applying the SDPA scale AFTER the Q*K^T MatMul is profitable.
// Moving the scalar Multiply past the MatMul saves work when the attention
// matrix (S_q x S_k) is smaller than the query tensor (e.g. decode with S_q = 1).
// Returns false whenever the shapes are dynamic, since the comparison needs
// static element counts on both sides.
bool can_move_scale_after_matmul(const ov::Output<ov::Node>& query,
                                 const ov::Output<ov::Node>& kT,
                                 const ov::Output<ov::Node>& scale) {
    const auto& scale_pshape = scale.get_partial_shape();
    const auto& query_pshape = query.get_partial_shape();
    if (scale_pshape.is_dynamic() || query_pshape.is_dynamic()) {
        return false;
    }

    // According to the ov SDPA specification, the scale input has to be 1D with
    // a single element, or a scalar.
    if (ov::shape_size(scale_pshape.to_shape()) != 1) {
        return false;
    }

    // Use the original op arrangement to infer the result shape via shape
    // propagation. Moving the scale after MatMul pays off only if the tensor
    // produced by MatMul is smaller than the query tensor.
    auto q_scaled = std::make_shared<v1::Multiply>(query, scale);
    auto scaled_attn = std::make_shared<v0::MatMul>(q_scaled, kT);
    const auto& scaled_attn_pshape = scaled_attn->output(0).get_partial_shape();
    if (scaled_attn_pshape.is_static()) {
        return ov::shape_size(query_pshape.to_shape()) > ov::shape_size(scaled_attn_pshape.to_shape());
    }
    return false;
}
92+
93+
} // namespace
94+
4695
ov::pass::ScaledDotProductAttentionDecomposition::ScaledDotProductAttentionDecomposition() {
4796
MATCHER_SCOPE(ScaledDotProductAttentionDecomposition);
4897
auto pattern_node = ov::pass::pattern::wrap_type<v13::ScaledDotProductAttention>();
@@ -123,10 +172,20 @@ std::shared_ptr<ov::Node> ov::pass::ScaledDotProductAttentionDecomposition::deco
123172
register_new_node<v0::Concat>(OutputVector{k_dims_before_transpose, k_last_dim, k_next_dim}, 0);
124173
auto k_transposed = register_new_node<v1::Transpose>(key, transpose_dims);
125174

126-
// Always apply scale to K^T. SDPAFusion absorbs K-side scale into the SDPA scale
127-
// parameter, so restoring it on K^T preserves the original computation order.
128-
auto k_scaled = register_new_node<v1::Multiply>(k_transposed, scale);
129-
auto scaled_atten = register_new_node<v0::MatMul>(query, k_scaled)->output(0);
175+
ov::Output<Node> scaled_atten;
176+
if (is_query_prescaled(query)) {
177+
// Q is already pre-scaled (e.g., Multiply(Q, scalar_constant)).
178+
// Apply scale to K^T to preserve the original computation order
179+
// and minimize FP rounding divergence across transformer layers.
180+
auto k_scaled = register_new_node<v1::Multiply>(k_transposed, scale);
181+
scaled_atten = register_new_node<v0::MatMul>(query, k_scaled)->output(0);
182+
} else if (can_move_scale_after_matmul(query, k_transposed, scale)) {
183+
auto atten = register_new_node<v0::MatMul>(query, k_transposed)->output(0);
184+
scaled_atten = register_new_node<v1::Multiply>(atten, scale)->output(0);
185+
} else {
186+
auto q_scaled = register_new_node<v1::Multiply>(query, scale);
187+
scaled_atten = register_new_node<v0::MatMul>(q_scaled, k_transposed)->output(0);
188+
}
130189

131190
minus_inf = register_new_node<v1::ConvertLike>(minus_inf, scaled_atten);
132191

src/common/transformations/tests/op_conversions/scaled_dot_product_decomposition_test.cpp

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ const std::shared_ptr<ov::Node> scaled_dot_product_attention_decomposition(std::
4646
std::shared_ptr<ov::Node> attention_mask,
4747
std::shared_ptr<ov::Node> scale,
4848
bool casual,
49+
bool scale_on_k = false,
50+
bool scale_after_matmul = false,
4951
std::shared_ptr<ov::Node> sinks = nullptr);
5052

5153
TEST_F(TransformationTestsF, ScaledDotProductAttentionDecompositionStaticBasic) {
@@ -132,7 +134,7 @@ TEST_F(TransformationTestsF, ScaledDotProductAttentionDecompositionStaticBroadca
132134

133135
{
134136
const auto scaled_dot_product_attention =
135-
scaled_dot_product_attention_decomposition(query, key, value, attention_mask, scale, casual);
137+
scaled_dot_product_attention_decomposition(query, key, value, attention_mask, scale, casual, false, true);
136138
model_ref = std::make_shared<ov::Model>(OutputVector{scaled_dot_product_attention},
137139
ParameterVector{query, key, value, attention_mask, scale});
138140
}
@@ -196,7 +198,7 @@ TEST_F(TransformationTestsF, ScaledDotProductAttentionDecompositionDynamic) {
196198
}
197199
}
198200

199-
TEST_F(TransformationTestsF, ScaledDotProductAttentionDecomposition_ScalarScale_MultiplyOnK) {
201+
TEST_F(TransformationTestsF, ScaledDotProductAttentionDecomposition_ScalarScale_MultiplyAfterMatMul) {
200202
const PartialShape query_shape{1, 32, 64};
201203
const PartialShape key_shape{1, 32, 64};
202204
const PartialShape value_shape{1, 32, 64};
@@ -219,12 +221,13 @@ TEST_F(TransformationTestsF, ScaledDotProductAttentionDecomposition_ScalarScale_
219221
}
220222

221223
{
222-
auto ref = scaled_dot_product_attention_decomposition(query, key, value, attention_mask, scale, casual);
224+
auto ref =
225+
scaled_dot_product_attention_decomposition(query, key, value, attention_mask, scale, casual, false, true);
223226
model_ref = std::make_shared<ov::Model>(OutputVector{ref}, ParameterVector{query, key, value, attention_mask});
224227
}
225228
}
226229

227-
TEST_F(TransformationTestsF, ScaledDotProductAttentionDecomposition_DynamicScale_MultiplyOnK) {
230+
TEST_F(TransformationTestsF, ScaledDotProductAttentionDecomposition_DynamicScale_MultiplyBeforeMatMul) {
228231
const PartialShape query_shape{-1, -1, 64};
229232
const PartialShape key_shape{-1, -1, 64};
230233
const PartialShape value_shape{-1, -1, 64};
@@ -259,6 +262,8 @@ const std::shared_ptr<ov::Node> scaled_dot_product_attention_decomposition(std::
259262
std::shared_ptr<ov::Node> attention_mask,
260263
std::shared_ptr<ov::Node> scale,
261264
bool casual,
265+
bool scale_on_k,
266+
bool scale_after_matmul,
262267
std::shared_ptr<ov::Node> sinks) {
263268
const auto q_shape = std::make_shared<v3::ShapeOf>(query, element::i32);
264269
const auto k_shape = std::make_shared<v3::ShapeOf>(key, element::i32);
@@ -298,8 +303,17 @@ const std::shared_ptr<ov::Node> scaled_dot_product_attention_decomposition(std::
298303
std::make_shared<v0::Concat>(OutputVector{k_dims_before_transpose, k_last_dim, k_next_dim}, 0);
299304
const auto k_transposed = std::make_shared<v1::Transpose>(key, transpose_dims);
300305

301-
const auto k_scaled = std::make_shared<v1::Multiply>(k_transposed, scale);
302-
Output<Node> scaled_atten = std::make_shared<v0::MatMul>(query, k_scaled)->output(0);
306+
Output<Node> scaled_atten;
307+
if (scale_on_k) {
308+
const auto k_scaled = std::make_shared<v1::Multiply>(k_transposed, scale);
309+
scaled_atten = std::make_shared<v0::MatMul>(query, k_scaled)->output(0);
310+
} else if (scale_after_matmul) {
311+
const auto atten = std::make_shared<v0::MatMul>(query, k_transposed)->output(0);
312+
scaled_atten = std::make_shared<v1::Multiply>(atten, scale);
313+
} else {
314+
const auto q_scaled = std::make_shared<v1::Multiply>(query, scale);
315+
scaled_atten = std::make_shared<v0::MatMul>(q_scaled, k_transposed)->output(0);
316+
}
303317
minus_inf = std::make_shared<v1::ConvertLike>(minus_inf, scaled_atten);
304318

305319
Output<Node> mask;
@@ -388,9 +402,9 @@ TEST_F(TransformationTestsF, ScaledDotProductAttentionDecomposition_PreScaledQue
388402
}
389403

390404
{
391-
// Expected: scale applied to K^T (always, unconditionally)
405+
// Expected: scale applied to K^T (Q is pre-scaled)
392406
auto ref =
393-
scaled_dot_product_attention_decomposition(query_prescaled, key, value, attention_mask, sdpa_scale, casual);
407+
scaled_dot_product_attention_decomposition(query_prescaled, key, value, attention_mask, sdpa_scale, casual, true);
394408
model_ref =
395409
std::make_shared<ov::Model>(OutputVector{ref}, ParameterVector{raw_query, key, value, attention_mask});
396410
}
@@ -422,7 +436,7 @@ TEST_F(TransformationTestsF, ScaledDotProductAttentionDecomposition_Sinks) {
422436
}
423437

424438
{
425-
auto ref = scaled_dot_product_attention_decomposition(query, key, value, attention_mask, scale, casual, sinks);
439+
auto ref = scaled_dot_product_attention_decomposition(query, key, value, attention_mask, scale, casual, false, false, sinks);
426440
model_ref = std::make_shared<ov::Model>(OutputVector{ref},
427441
ParameterVector{query, key, value, attention_mask, scale, sinks});
428442
}

0 commit comments

Comments (0)