
Commit e3f6861

[GPU] Add IncreasePositionIdsPrecision for Qwen3-VL models (#34716)
### Description of the issue (symptom, root-cause, how it was resolved)

- Symptom: The Qwen3-VL-4B-Instruct INT4 model produces incorrect output on GPU for long input sequences (>2048 tokens). The first-token prediction is wrong, causing completely incoherent generated text. CPU output is correct.
- Root-cause: The `position_ids` (integer values) are converted to FP16 before the frequency MatMul in the RoPE computation path. FP16 has only 10 mantissa bits, so integers in the range 4096–8192 are rounded to the nearest multiple of 4 (e.g., 4173→4172, 4174→4176). This corrupts the sin/cos positional embeddings fed into every transformer layer. The existing `IncreasePositionIdsPrecision` transformation has 4 model-specific patterns, but none matches Qwen3-VL because: (1) Unsqueeze is decomposed to Reshape by the frontend, and (2) the path between MatMul and Sin/Cos includes a complex `Gather×3 → ScatterNDUpdate` chain for 3D position assignment (temporal, height, width) that is unique to Qwen3-VL.
- Resolution: Added an `IncreasePositionIdsPrecisionForQwen3VL` matcher pass that pattern-matches `Convert→Reshape|Unsqueeze→Convert(i32→f16)→MatMul(Broadcast,...)`, then uses a forward BFS from MatMul to locate the downstream Sin/Cos nodes. The transformation upgrades the position_ids computation path from f16 to f32 and inserts f32→f16 Converts after Sin/Cos to restore the original precision at the boundary.

#### The code and line that caused this issue (if it is not changed directly)

- intel_gpu/src/plugin/transformations/increase_position_ids_precision.cpp
- `IncreasePositionIdsPrecision::run_on_model()`: the 4 existing sub-passes (ForRoPE, ForQwen25VL, ForLtxVideo, ForGPTOSS) all failed to match the Qwen3-VL graph pattern, so no precision upgrade was applied.

#### Reproduction step and snapshot (if applicable. Do not attach for customer model)

- `python genai/tools/llm_bench/benchmark.py -m Qwen3-VL-4B-Instruct/INT4 -d GPU.1 --task visual_text_gen -pf raw_prompt.jsonl -ic 128 -lc config.json`
- where config.json = `{"ATTENTION_BACKEND": "PA", "CACHE_DIR": ""}`
- Input: 5545 tokens (tool-calling prompt without image)

#### Problematic graph

- Qwen3-VL RoPE position_ids path in the language model subgraph: <img width="509" height="1014" alt="image" src="https://github.com/user-attachments/assets/2c5632a0-ad75-440e-b218-4f42f16f9726" />
- The fix changes Convert(i32→f16) to Convert(i32→f32), inserts Convert(f16→f32) after Broadcast, and inserts Convert(f32→f16) after Sin/Cos.

#### Checklist

- [x] Is it a proper fix? (not a workaround)
- [x] Did you include a test case for this fix, if necessary?
- [x] Did you review an existing test that can be extended to cover this scenario? Which test did you review?
  - Reviewed the `IncreasePositionIdsPrecisionForQwen25VL` test and added a new dedicated test, `IncreasePositionIdsPrecisionForQwen3VL`.

### Tickets:

- [CVS-182656](https://jira.devtools.intel.com/browse/CVS-182656)

Signed-off-by: Andrew Park <andrew.park@intel.com>
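The FP16 rounding described in the root-cause can be reproduced without any model. The sketch below emulates fp16 rounding for normal-range values via `frexp`/`ldexp`; the helper name `round_to_fp16` and the emulation approach are illustrative, not OpenVINO code (real fp16 uses round-to-nearest-even, which agrees with `std::round` for the values shown).

```cpp
#include <cmath>

// Emulate IEEE fp16 rounding of a normal-range value: fp16 has a 10-bit
// mantissa, so the spacing (ulp) between representable values at
// magnitude [2^k, 2^(k+1)) is 2^(k-10).
double round_to_fp16(double v) {
    if (v == 0.0)
        return 0.0;
    int e;
    std::frexp(v, &e);                     // |v| = m * 2^e with m in [0.5, 1)
    double ulp = std::ldexp(1.0, e - 11);  // fp16 spacing at |v|
    return std::round(v / ulp) * ulp;
}
// Integers in [4096, 8192) have ulp = 4, so position ids get snapped to
// multiples of 4: round_to_fp16(4173) == 4172, round_to_fp16(4174) == 4176,
// while integers up to 2048 survive exactly.
```

This matches the corruption pattern in the symptom: every position id beyond 2048 can drift by up to half an ulp before it ever reaches the sin/cos tables.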
1 parent 6fab794 commit e3f6861

File tree

3 files changed (+215, −0 lines)


src/plugins/intel_gpu/src/plugin/transformations/increase_position_ids_precision.cpp

Lines changed: 110 additions & 0 deletions
@@ -4,6 +4,8 @@
 
 #include "increase_position_ids_precision.hpp"
 
+#include <set>
+
 #include "intel_gpu/op/gemm.hpp"
 #include "ov_ops/rotary_positional_embeddings.hpp"

@@ -186,6 +188,113 @@ IncreasePositionIdsPrecisionForQwen25VL::IncreasePositionIdsPrecisionForQwen25VL
     this->register_matcher(m, callback);
 }
 
+IncreasePositionIdsPrecisionForQwen3VL::IncreasePositionIdsPrecisionForQwen3VL() {
+    using namespace ov::pass::pattern;
+    using ov::pass::pattern::op::Or;
+
+    // Qwen3-VL RoPE pattern:
+    // position_ids -> Convert(i64->i32) -> Reshape(unsqueeze) -> Convert(i32->f16) -> MatMul(Broadcast, Convert)
+    //             -> Reshape(transpose) -> Gather(select_channel) x3 -> ScatterNDUpdate chain -> Reshape -> Concat(self,self)
+    //             -> Sin/Cos -> Reshape(unsqueeze) -> RoPE
+    //
+    // The intermediate path between MatMul and Sin/Cos is too complex to pattern-match,
+    // so we match the beginning (up to MatMul) and use graph traversal to find downstream Sin/Cos.
+    // Key difference from Qwen2.5-VL: Unsqueeze is decomposed to Reshape.
+    auto position_ids = any_input();
+    auto convert_to_i32 = wrap_type<ov::op::v0::Convert>({position_ids});
+    auto reshape_unsqueeze = wrap_type<ov::op::v1::Reshape>({convert_to_i32, wrap_type<ov::op::v0::Constant>()});
+    auto unsqueeze = wrap_type<ov::op::v0::Unsqueeze>({convert_to_i32, any_input()});
+    auto reshape_or_unsqueeze = std::make_shared<Or>(OutputVector{reshape_unsqueeze, unsqueeze});
+    auto convert_to_f16 = wrap_type<ov::op::v0::Convert>({reshape_or_unsqueeze});
+
+    auto broadcast_freq = wrap_type<ov::op::v3::Broadcast>({any_input(), any_input()});
+    auto matmul = wrap_type<ov::op::v0::MatMul>({broadcast_freq, convert_to_f16});
+
+    ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) {
+        const auto& pattern_map = m.get_pattern_value_map();
+
+        auto convert_node = ov::as_type_ptr<ov::op::v0::Convert>(pattern_map.at(convert_to_f16).get_node_shared_ptr());
+        auto broadcast_node = pattern_map.at(broadcast_freq).get_node_shared_ptr();
+        auto matmul_node = ov::as_type_ptr<ov::op::v0::MatMul>(pattern_map.at(matmul).get_node_shared_ptr());
+
+        if (!convert_node || !matmul_node || transformation_callback(convert_node))
+            return false;
+
+        const auto desired_et = ov::element::f32;
+        const auto original_et = convert_node->get_output_element_type(0);
+        if (original_et == desired_et)
+            return false;
+
+        // Verify input is integer type (position_ids should be i32 or i64)
+        auto input_et = convert_node->input_value(0).get_element_type();
+        if (!input_et.is_integral())
+            return false;
+
+        // Walk forward from MatMul to find Sin and Cos nodes through the
+        // Reshape -> Gather -> ScatterNDUpdate -> Reshape -> Concat chain.
+        // Only follow floating-point data outputs to stay on the data path.
+        std::shared_ptr<ov::op::v0::Sin> sin_node;
+        std::shared_ptr<ov::op::v0::Cos> cos_node;
+
+        std::vector<ov::Node*> stack;
+        std::set<ov::Node*> visited;
+        stack.push_back(matmul_node.get());
+        constexpr size_t max_nodes = 30;
+        size_t nodes_visited = 0;
+
+        while (!stack.empty() && nodes_visited < max_nodes && (!sin_node || !cos_node)) {
+            auto* current = stack.back();
+            stack.pop_back();
+
+            for (auto& output : current->outputs()) {
+                if (!output.get_element_type().is_real())
+                    continue;
+                for (auto& target_input : output.get_target_inputs()) {
+                    auto consumer = target_input.get_node()->shared_from_this();
+                    if (!visited.insert(consumer.get()).second)
+                        continue;
+                    nodes_visited++;
+
+                    if (auto sin_ptr = ov::as_type_ptr<ov::op::v0::Sin>(consumer)) {
+                        sin_node = sin_ptr;
+                    } else if (auto cos_ptr = ov::as_type_ptr<ov::op::v0::Cos>(consumer)) {
+                        cos_node = cos_ptr;
+                    } else {
+                        stack.push_back(consumer.get());
+                    }
+                }
+            }
+        }
+
+        if (!sin_node || !cos_node)
+            return false;
+
+        // 1. Change Convert output from f16 to f32 (position_ids path)
+        auto new_convert = std::make_shared<ov::op::v0::Convert>(convert_node->input_value(0), desired_et);
+        new_convert->set_friendly_name(convert_node->get_friendly_name() + "_increase_precision");
+        copy_runtime_info(convert_node, new_convert);
+        ov::replace_node(convert_node, new_convert);
+
+        // 2. Insert Convert(f16->f32) after Broadcast (freq path) to match MatMul types
+        if (broadcast_node->get_output_element_type(0) != desired_et) {
+            auto broadcast_to_f32 = std::make_shared<ov::op::v0::Convert>(broadcast_node->output(0), desired_et);
+            broadcast_to_f32->set_friendly_name(broadcast_node->get_friendly_name() + "_to_f32");
+            copy_runtime_info(broadcast_node, broadcast_to_f32);
+            matmul_node->input(0).replace_source_output(broadcast_to_f32->output(0));
+        }
+
+        // 3. Insert Convert(f32->f16) after Sin/Cos to restore original precision
+        size_t output_idx = 0;
+        insert_converts_after_if_needed(sin_node, original_et, output_idx);
+        insert_converts_after_if_needed(cos_node, original_et, output_idx);
+
+        return true;
+    };
+
+    auto m = std::make_shared<ov::pass::pattern::Matcher>(matmul, "IncreasePositionIdsPrecisionForQwen3VL");
+    this->register_matcher(m, callback);
+}
+
 IncreasePositionIdsPrecisionForLtxVideo::IncreasePositionIdsPrecisionForLtxVideo() {
     using namespace ov::pass::pattern;
     using ov::pass::pattern::op::Or;

@@ -338,6 +447,7 @@ bool IncreasePositionIdsPrecision::run_on_model(const std::shared_ptr<ov::Model>
     auto symbolic_ctx_manager = symbolic_optimizations.get_manager();
     symbolic_ctx_manager->register_pass<IncreasePositionIdsPrecisionForRoPE>();
     symbolic_ctx_manager->register_pass<IncreasePositionIdsPrecisionForQwen25VL>();
+    symbolic_ctx_manager->register_pass<IncreasePositionIdsPrecisionForQwen3VL>();
    symbolic_ctx_manager->register_pass<IncreasePositionIdsPrecisionForLtxVideo>();
     symbolic_ctx_manager->register_pass<IncreasePositionIdsPrecisionForGPTOSS>();
     return symbolic_optimizations.run_on_model(model);
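The bounded forward traversal in the callback above can be illustrated with a toy graph. This sketch replaces `ov::Node` consumer edges with a plain adjacency map of strings (the function name, node names, and map layout are all hypothetical, not OpenVINO API); like the pass, it caps the number of visited nodes and stops descending once a target is reached.

```cpp
#include <map>
#include <set>
#include <string>
#include <vector>

// Toy sketch of the bounded stack-based forward traversal used to find
// Sin/Cos downstream of MatMul. `consumers[node]` lists the nodes fed by
// `node`'s outputs; `targets` is the set of node kinds we are looking for.
std::set<std::string> find_targets(const std::map<std::string, std::vector<std::string>>& consumers,
                                   const std::string& start,
                                   const std::set<std::string>& targets,
                                   size_t max_nodes) {
    std::set<std::string> found;
    std::set<std::string> visited;
    std::vector<std::string> stack{start};
    size_t nodes_visited = 0;

    while (!stack.empty() && nodes_visited < max_nodes && found.size() < targets.size()) {
        auto current = stack.back();
        stack.pop_back();

        auto it = consumers.find(current);
        if (it == consumers.end())
            continue;
        for (const auto& next : it->second) {
            if (!visited.insert(next).second)  // skip already-visited nodes
                continue;
            nodes_visited++;
            if (targets.count(next))
                found.insert(next);            // do not descend past a target
            else
                stack.push_back(next);
        }
    }
    return found;
}
```

On a chain MatMul → Reshape → Gather → Concat → {Sin, Cos}, calling `find_targets(g, "MatMul", {"Sin", "Cos"}, 30)` returns both targets; with a budget too small to reach Concat's consumers, it returns an empty set, which mirrors how the real pass bails out (`return false`) when Sin/Cos are not found within `max_nodes`.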

src/plugins/intel_gpu/src/plugin/transformations/increase_position_ids_precision.hpp

Lines changed: 6 additions & 0 deletions
@@ -20,6 +20,12 @@ class IncreasePositionIdsPrecisionForQwen25VL : public ov::pass::MatcherPass {
     IncreasePositionIdsPrecisionForQwen25VL();
 };
 
+class IncreasePositionIdsPrecisionForQwen3VL : public ov::pass::MatcherPass {
+public:
+    OPENVINO_MATCHER_PASS_RTTI("IncreasePositionIdsPrecisionForQwen3VL");
+    IncreasePositionIdsPrecisionForQwen3VL();
+};
+
 class IncreasePositionIdsPrecisionForLtxVideo : public ov::pass::MatcherPass {
 public:
     OPENVINO_MATCHER_PASS_RTTI("IncreasePositionIdsPrecisionForLtxVideo");
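A note on why f32 is a sufficient target precision for the upgraded path: a float's 23-bit mantissa represents every integer up to 2^24 exactly, far beyond any realistic position id, whereas fp16 already loses integer exactness past 2048. A minimal round-trip check (helper name illustrative):

```cpp
// A float (f32) has 23 explicit mantissa bits, so every integer n with
// |n| <= 2^24 = 16777216 round-trips exactly through float; beyond that,
// odd integers start getting rounded away.
bool exact_in_float(long long n) {
    float f = static_cast<float>(n);
    return static_cast<long long>(f) == n;
}
```

So the position ids that fp16 corrupts (e.g. 4173, 5545) are all represented exactly once the Convert targets f32.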

src/plugins/intel_gpu/tests/unit/transformations/increase_precision_test.cpp

Lines changed: 99 additions & 0 deletions
@@ -729,6 +729,105 @@ TEST_F(TransformationTestsF, IncreasePositionIdsPrecisionForQwen25VL) {
     comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES);
 }
 
+TEST_F(TransformationTestsF, IncreasePositionIdsPrecisionForQwen3VL) {
+    // Qwen3-VL pattern: position_ids -> Convert(i64->i32) -> Reshape(unsqueeze) -> Convert(i32->f16)
+    //   -> MatMul(Broadcast, Convert) -> Reshape(transpose) -> Gather -> Concat(self,self)
+    //   -> Sin/Cos -> Reshape(unsqueeze) -> RoPE
+    {
+        auto position_ids = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::PartialShape{ 3, -1 });
+        auto input_convert = std::make_shared<ov::op::v0::Convert>(position_ids, ov::element::i32);
+        // Qwen3-VL uses Reshape instead of Unsqueeze
+        auto input_reshape = std::make_shared<ov::op::v1::Reshape>(input_convert,
+            std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{4}, std::vector<int32_t>{3, -1, 1, 1}), true);
+        auto convert_2 = std::make_shared<ov::op::v0::Convert>(input_reshape, ov::element::f16);
+
+        auto shape_of = std::make_shared<ov::op::v3::ShapeOf>(input_convert, ov::element::i32);
+        auto gather_0 = std::make_shared<ov::op::v8::Gather>(shape_of,
+            std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{1}, std::vector<int64_t>{1}),
+            std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{}, std::vector<int64_t>{0}));
+        auto concat = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{
+            std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{1}, std::vector<int64_t>{1}),
+            gather_0,
+            std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{1}, std::vector<int64_t>{64}),
+            std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{1}, std::vector<int64_t>{1})}, 0);
+        auto broadcast = std::make_shared<ov::op::v3::Broadcast>(
+            std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 1, 64, 1}), concat);
+        auto matmul = std::make_shared<ov::op::v0::MatMul>(broadcast, convert_2);
+
+        // Reshape(transpose) -> Gather(select channel 0)
+        auto reshape_transpose = std::make_shared<ov::op::v1::Reshape>(matmul,
+            std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{4}, std::vector<int32_t>{3, -1, 1, 64}), true);
+        auto gather_ch0 = std::make_shared<ov::op::v8::Gather>(reshape_transpose,
+            std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{}, std::vector<int64_t>{0}),
+            std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{}, std::vector<int64_t>{0}));
+
+        // Concat(self, self) to produce [?, 1, 128]
+        auto concat_2 = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{gather_ch0, gather_ch0}, 2);
+
+        auto cos = std::make_shared<ov::op::v0::Cos>(concat_2);
+        auto sin = std::make_shared<ov::op::v0::Sin>(concat_2);
+        auto cos_unsqueeze = std::make_shared<ov::op::v1::Reshape>(cos,
+            std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{4}, std::vector<int32_t>{-1, 1, 1, 128}), true);
+        auto sin_unsqueeze = std::make_shared<ov::op::v1::Reshape>(sin,
+            std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{4}, std::vector<int32_t>{-1, 1, 1, 128}), true);
+
+        auto input_2 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{ -1, 8, -1, 128});
+        auto rope = std::make_shared<ov::op::internal::RoPE>(ov::OutputVector{input_2, cos_unsqueeze, sin_unsqueeze},
+                                                             ov::op::internal::RoPE::Config());
+
+        model = std::make_shared<ov::Model>(ov::OutputVector{rope}, ov::ParameterVector{position_ids, input_2});
+        manager.register_pass<IncreasePositionIdsPrecision>();
+    }
+    {
+        auto position_ids = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::PartialShape{ 3, -1 });
+        auto input_convert = std::make_shared<ov::op::v0::Convert>(position_ids, ov::element::i32);
+        auto input_reshape = std::make_shared<ov::op::v1::Reshape>(input_convert,
+            std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{4}, std::vector<int32_t>{3, -1, 1, 1}), true);
+        // Changed: Convert to f32 instead of f16
+        auto convert_2 = std::make_shared<ov::op::v0::Convert>(input_reshape, ov::element::f32);
+
+        auto shape_of = std::make_shared<ov::op::v3::ShapeOf>(input_convert, ov::element::i32);
+        auto gather_0 = std::make_shared<ov::op::v8::Gather>(shape_of,
+            std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{1}, std::vector<int64_t>{1}),
+            std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{}, std::vector<int64_t>{0}));
+        auto concat = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{
+            std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{1}, std::vector<int64_t>{1}),
+            gather_0,
+            std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{1}, std::vector<int64_t>{64}),
+            std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{1}, std::vector<int64_t>{1})}, 0);
+        auto broadcast = std::make_shared<ov::op::v3::Broadcast>(
+            std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 1, 64, 1}), concat);
+        // Changed: Insert Convert(f16->f32) after Broadcast
+        auto broadcast_to_f32 = std::make_shared<ov::op::v0::Convert>(broadcast, ov::element::f32);
+        auto matmul = std::make_shared<ov::op::v0::MatMul>(broadcast_to_f32, convert_2);
+
+        auto reshape_transpose = std::make_shared<ov::op::v1::Reshape>(matmul,
+            std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{4}, std::vector<int32_t>{3, -1, 1, 64}), true);
+        auto gather_ch0 = std::make_shared<ov::op::v8::Gather>(reshape_transpose,
+            std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{}, std::vector<int64_t>{0}),
+            std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{}, std::vector<int64_t>{0}));
+
+        auto concat_2 = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{gather_ch0, gather_ch0}, 2);
+
+        auto cos = std::make_shared<ov::op::v0::Cos>(concat_2);
+        auto sin = std::make_shared<ov::op::v0::Sin>(concat_2);
+        // Changed: Insert Convert(f32->f16) after Cos and Sin
+        auto cos_to_f16 = std::make_shared<ov::op::v0::Convert>(cos, ov::element::f16);
+        auto sin_to_f16 = std::make_shared<ov::op::v0::Convert>(sin, ov::element::f16);
+        auto cos_unsqueeze = std::make_shared<ov::op::v1::Reshape>(cos_to_f16,
+            std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{4}, std::vector<int32_t>{-1, 1, 1, 128}), true);
+        auto sin_unsqueeze = std::make_shared<ov::op::v1::Reshape>(sin_to_f16,
+            std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{4}, std::vector<int32_t>{-1, 1, 1, 128}), true);
+
+        auto input_2 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{ -1, 8, -1, 128});
+        auto rope = std::make_shared<ov::op::internal::RoPE>(ov::OutputVector{input_2, cos_unsqueeze, sin_unsqueeze},
+                                                             ov::op::internal::RoPE::Config());
+
+        model_ref = std::make_shared<ov::Model>(ov::OutputVector{rope}, ov::ParameterVector{position_ids, input_2});
+    }
+    comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES);
+}
+
 TEST_F(TransformationTestsF, IncreasePositionIdsPrecisionForGPTOSS) {
     {
         auto position_ids = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::PartialShape{ 3, -1, -1 });
