Skip to content

Commit 72d3f44

Browse files
authored
Merge branch 'master' into mvafin/pt_fe/where_cond
2 parents 26244fe + 4402dee commit 72d3f44

File tree

102 files changed

+4522
-880
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

102 files changed

+4522
-880
lines changed

.github/dockerfiles/docker_tag

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
pr-34333
1+
pr-34596

.github/dockerfiles/ov_build/fedora_29/Dockerfile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@ USER root
77
RUN echo "timeout=60" >> /etc/dnf/dnf.conf && \
88
echo "retries=10" >> /etc/dnf/dnf.conf
99

10+
# Hackity hack: Fedora 29 is out of support for so long now
11+
# that we need to steal `ca-certificates` from Rocky Linux 8 repos
12+
# to trust "storage.openvinotoolkit.org" again
13+
RUN rpm -ihv --force https://download.rockylinux.org/pub/rocky/8/BaseOS/x86_64/os/Packages/c/ca-certificates-2025.2.80_v9.0.304-80.2.el8_10.noarch.rpm
14+
1015
RUN dnf update -y && dnf install -y \
1116
git \
1217
curl \

.github/workflows/job_pytorch_models_tests.yml

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -129,14 +129,6 @@ jobs:
129129
env:
130130
TEST_DEVICE: CPU
131131

132-
- name: RoPE Test
133-
if: ${{ inputs.model_scope == 'precommit' }}
134-
run: |
135-
export PYTHONPATH=${MODEL_HUB_TESTS_INSTALL_DIR}:$PYTHONPATH
136-
python3 -m pytest ${MODEL_HUB_TESTS_INSTALL_DIR}/transformation_tests/test_transformations.py -m precommit --html=${INSTALL_TEST_DIR}/TEST-torch_rope_tests.html --self-contained-html -v --tb=short -n 2
137-
env:
138-
TEST_DEVICE: CPU
139-
140132
- name: StatefulToStateless Test
141133
if: ${{ inputs.model_scope == 'precommit' }}
142134
run: |
@@ -164,6 +156,15 @@ jobs:
164156
env:
165157
TEST_DEVICE: CPU
166158

159+
- name: RoPE Test
160+
if: ${{ inputs.model_scope == 'precommit' }}
161+
run: |
162+
export PYTHONPATH=${MODEL_HUB_TESTS_INSTALL_DIR}:$PYTHONPATH
163+
python3 -m pip install -r ${INSTALL_TEST_DIR}/requirements_rope.txt
164+
python3 -m pytest ${MODEL_HUB_TESTS_INSTALL_DIR}/transformation_tests/test_transformations.py -m precommit --html=${INSTALL_TEST_DIR}/TEST-torch_rope_tests.html --self-contained-html -v --tb=short -n 2
165+
env:
166+
TEST_DEVICE: CPU
167+
167168
- name: Reformat unsupported ops file
168169
if: ${{ inputs.model_scope != 'precommit' && !cancelled()}}
169170
run: |

.github/workflows/ubuntu_22.yml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -631,6 +631,20 @@ jobs:
631631
# if: ${{ github.event_name == 'schedule' }}
632632
if: ${{ 'false' }} # Ticket: 143677
633633

634+
# Ticket: CVS-182443 — iGPU runners cannot install pip packages from PyPI, need to whitelist pypi.org and download.pytorch.org
635+
# iGPU_RoPE_Tests:
636+
# name: iGPU RoPE Tests
637+
# needs: [ Build, Smart_CI, Openvino_tokenizers ]
638+
# uses: ./.github/workflows/job_gpu_tests.yml
639+
# with:
640+
# device: 'igpu'
641+
# test_type: 'rope'
642+
# runner: "[ 'self-hosted', 'igpu' ]"
643+
# runner-group: 'Intel-GPU'
644+
# image: ubuntu:22.04
645+
# options: "--group-add 44 --group-add 993 --device /dev/dri/card1:/dev/dri/card1 --device /dev/dri/renderD128:/dev/dri/renderD128 -e HF_TOKEN"
646+
# if: fromJSON(needs.smart_ci.outputs.affected_components).GPU
647+
634648
Overall_Status:
635649
name: ci/gha_overall_status
636650
needs: [Smart_CI, Build, Debian_Packages, Samples, Conformance, CXX_Unit_Tests, Python_Unit_Tests, TensorFlow_Layer_Tests, Pytorch_Layer_Tests,

.github/workflows/windows_conditional_compilation.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,13 @@ jobs:
360360
should-setup-pip-paths: 'false'
361361
self-hosted-runner: 'false'
362362

363+
- name: Install Python's certifi for newer ca-certificates bundle for wget
364+
run: |
365+
python3 -m pip install certifi
366+
367+
- name: Set SSL_CERT_FILE for model downloading for unit tests
368+
run: echo SSL_CERT_FILE=$(python3 -m certifi) >> $env:GITHUB_ENV
369+
363370
- name: CMake configure - CC ON
364371
run: |
365372
cmake `

docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ Operation Specifications
8787
GRN-1 <operation-specs/normalization/grn-1>
8888
GRUCell-3 <operation-specs/sequence/gru-cell-3>
8989
GRUSequence-5 <operation-specs/sequence/gru-sequence-5>
90+
GatedDeltaNet <operation-specs/internal/gated-delta-net>
9091
GatherTree-1 <operation-specs/movement/gather-tree-1>
9192
Gather-1 <operation-specs/movement/gather-1>
9293
Gather-7 <operation-specs/movement/gather-7>
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
.. {#openvino_docs_ops_internal_GatedDeltaNet}
2+
3+
GatedDeltaNet
4+
=============
5+
6+
7+
.. meta::
8+
:description: Learn about GatedDeltaNet - a linear recurrent sequence processing
9+
operation based on the delta rule with a gating mechanism.
10+
11+
**Versioned name**: *GatedDeltaNet*
12+
13+
**Category**: *Sequence processing*
14+
15+
**Short description**: *GatedDeltaNet* represents a linear recurrent sequence model
16+
that combines the delta rule memory update with a gating mechanism.
17+
18+
**Detailed description**: *GatedDeltaNet* implements the recurrence from the paper
19+
`arXiv:2412.06464 <https://arxiv.org/abs/2412.06464>`__. It processes a sequence of
20+
query, key, and value vectors using the delta rule to update a hidden state matrix,
21+
controlled by a per-token forget ``gate`` (applied as ``exp(g)``) and a per-token
22+
write gate ``beta``. Queries are scaled by ``1 / sqrt(key_head_dim)`` before being used
23+
to compute the output. The following PyTorch-equivalent code illustrates the full
24+
computation:
25+
26+
.. code-block:: py
27+
28+
def torch_recurrent_gated_delta_rule(
29+
query, key, value, recurrent_state, gate, beta,
30+
):
31+
batch_size, sequence_length, num_heads, k_head_dim = key.shape
32+
v_head_dim = value.shape[-1]
33+
scale = 1 / (query.shape[-1] ** 0.5)
34+
query = query * scale
35+
36+
output_attn = torch.zeros(batch_size, sequence_length, num_heads, v_head_dim).to(value)
37+
output_recurrent_state = recurrent_state
38+
39+
for i in range(sequence_length):
40+
q_t = query[:, i]
41+
k_t = key[:, i]
42+
v_t = value[:, i]
43+
g_t = gate[:, i].exp().unsqueeze(-1).unsqueeze(-1)
44+
beta_t = beta[:, i].unsqueeze(-1)
45+
46+
output_recurrent_state = output_recurrent_state * g_t
47+
kv_mem = (output_recurrent_state * k_t.unsqueeze(-1)).sum(dim=-2)
48+
delta = (v_t - kv_mem) * beta_t
49+
output_recurrent_state = output_recurrent_state + k_t.unsqueeze(-1) * delta.unsqueeze(-2)
50+
output_attn[:, i] = (output_recurrent_state * q_t.unsqueeze(-1)).sum(dim=-2)
51+
52+
return output_attn, output_recurrent_state
53+
54+
55+
**Inputs**
56+
57+
* **1**: ``query`` - 4D tensor of type *T* and shape ``[batch_size, seq_len, num_heads, key_head_dim]``,
58+
the query vectors for each token and head. Scaled internally by ``1 / sqrt(key_head_dim)``
59+
before computing the output. **Required.**
60+
61+
* **2**: ``key`` - 4D tensor of type *T* and shape ``[batch_size, seq_len, num_heads, key_head_dim]``,
62+
the key vectors for each token and head. **Required.**
63+
64+
* **3**: ``value`` - 4D tensor of type *T* and shape ``[batch_size, seq_len, num_heads, value_head_dim]``,
65+
the value vectors for each token and head. **Required.**
66+
67+
* **4**: ``recurrent_state`` - 4D tensor of type *T* and shape
68+
``[batch_size, num_heads, key_head_dim, value_head_dim]``, the recurrent (initially all-zeros) hidden state matrix. **Required.**
69+
70+
* **5**: ``gate`` - 3D tensor of type *T* and shape ``[batch_size, seq_len, num_heads]``,
71+
the forget gate in log-space. Applied as ``exp(g)`` at each time step to decay the
72+
hidden state before the delta update. **Required.**
73+
74+
* **6**: ``beta`` - 3D tensor of type *T* and shape ``[batch_size, seq_len, num_heads]``,
75+
the write gate controlling how much of the delta correction is applied to the hidden
76+
state. **Required.**
77+
78+
79+
**Outputs**
80+
81+
* **1**: ``output_attn`` - 4D tensor of type *T* and shape
82+
``[batch_size, seq_len, num_heads, value_head_dim]``, the output vectors at each time step
83+
produced by applying the state matrix to the (scaled) query.
84+
85+
* **2**: ``output_recurrent_state`` - 4D tensor of type *T* and shape
86+
``[batch_size, num_heads, key_head_dim, value_head_dim]``, the hidden state matrix
87+
after processing the last token in the sequence.
88+
89+
90+
**Types**
91+
92+
* *T*: any supported floating-point type.
93+
94+
95+
**Example**
96+
97+
.. code-block:: xml
98+
:force:
99+
100+
<layer ... type="GatedDeltaNet" ...>
101+
<input>
102+
<port id="0"> <!-- `query` -->
103+
<dim>1</dim>
104+
<dim>16</dim>
105+
<dim>8</dim>
106+
<dim>64</dim>
107+
</port>
108+
<port id="1"> <!-- `key` -->
109+
<dim>1</dim>
110+
<dim>16</dim>
111+
<dim>8</dim>
112+
<dim>64</dim>
113+
</port>
114+
<port id="2"> <!-- `value` -->
115+
<dim>1</dim>
116+
<dim>16</dim>
117+
<dim>8</dim>
118+
<dim>128</dim>
119+
</port>
120+
<port id="3"> <!-- `recurrent_state` -->
121+
<dim>1</dim>
122+
<dim>8</dim>
123+
<dim>64</dim>
124+
<dim>128</dim>
125+
</port>
126+
<port id="4"> <!-- `gate` -->
127+
<dim>1</dim>
128+
<dim>16</dim>
129+
<dim>8</dim>
130+
</port>
131+
<port id="5"> <!-- `beta` -->
132+
<dim>1</dim>
133+
<dim>16</dim>
134+
<dim>8</dim>
135+
</port>
136+
</input>
137+
<output>
138+
<port id="6"> <!-- `output_attn` -->
139+
<dim>1</dim>
140+
<dim>16</dim>
141+
<dim>8</dim>
142+
<dim>128</dim>
143+
</port>
144+
<port id="7"> <!-- `output_recurrent_state` -->
145+
<dim>1</dim>
146+
<dim>8</dim>
147+
<dim>64</dim>
148+
<dim>128</dim>
149+
</port>
150+
</output>
151+
</layer>

src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/decompositions.py

Lines changed: 61 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -311,11 +311,69 @@ def get_export_decomposition_list():
311311

312312

313313
def ops_to_not_decompose():
314-
# list of operations that shouldn't be decomposed
314+
# list of operations that shouldn't be decomposed because
315+
# OpenVINO frontend handles them directly and more efficiently
315316
return [
316-
torch.ops.aten.col2im.default,
317+
# Activation functions - each maps to a single dedicated OV op
318+
torch.ops.aten.celu.default,
319+
torch.ops.aten.elu_.default,
320+
torch.ops.aten.glu.default,
321+
torch.ops.aten.hardsigmoid.default,
322+
torch.ops.aten.hardswish.default,
323+
torch.ops.aten.hardswish_.default,
324+
torch.ops.aten.hardtanh_.default,
325+
torch.ops.aten.leaky_relu_.default,
326+
torch.ops.aten.log_sigmoid_forward.default,
327+
torch.ops.aten.mish.default,
328+
torch.ops.aten.silu.default,
329+
torch.ops.aten.silu_.default,
330+
# Normalization
317331
torch.ops.aten.linear.default,
318332
torch.ops.aten.rms_norm.default,
333+
# Math and reduction ops with dedicated translators
334+
torch.ops.aten.all.default,
335+
torch.ops.aten.argsort.default,
336+
torch.ops.aten.argsort.stable,
337+
torch.ops.aten.baddbmm.default,
338+
torch.ops.aten.dot.default,
339+
torch.ops.aten.logaddexp.default,
340+
torch.ops.aten.logsumexp.default,
341+
torch.ops.aten.outer.default,
342+
torch.ops.aten.rad2deg.default,
343+
torch.ops.aten.std.correction,
344+
# Spatial and structural ops
345+
torch.ops.aten.channel_shuffle.default,
346+
torch.ops.aten.col2im.default,
347+
torch.ops.aten.pixel_shuffle.default,
348+
torch.ops.aten.pixel_unshuffle.default,
349+
torch.ops.aten.reflection_pad1d.default,
350+
torch.ops.aten.reflection_pad2d.default,
351+
torch.ops.aten.reflection_pad3d.default,
352+
torch.ops.aten.roll.default,
353+
# Index and scatter ops
354+
torch.ops.aten.index_add.default,
355+
torch.ops.aten.index_add_.default,
356+
torch.ops.aten.index_copy.default,
357+
torch.ops.aten.index_fill.int_Scalar,
358+
torch.ops.aten.index_fill_.int_Scalar,
359+
torch.ops.aten.masked_fill.Scalar,
360+
torch.ops.aten.masked_fill.Tensor,
361+
torch.ops.aten.masked_fill_.Scalar,
362+
torch.ops.aten.masked_fill_.Tensor,
363+
torch.ops.aten.select_scatter.default,
364+
# Tensor creation and manipulation
365+
torch.ops.aten.hstack.default,
366+
torch.ops.aten.linalg_cross.default,
367+
torch.ops.aten.linspace.default,
368+
torch.ops.aten.one_hot.default,
369+
torch.ops.aten.repeat_interleave.self_int,
370+
torch.ops.aten.repeat_interleave.self_Tensor,
371+
# Note: aten.take_along_dim.default is not listed here because
372+
# the translator doesn't handle the 2-input case (dim=None)
373+
torch.ops.aten.tril.default,
374+
torch.ops.aten.triu.default,
375+
torch.ops.aten.vstack.default,
376+
# Upsampling / interpolation
319377
torch.ops.aten.upsample_nearest1d.default,
320378
torch.ops.aten.upsample_nearest1d.vec,
321379
torch.ops.aten.upsample_nearest2d.default,
@@ -326,5 +384,6 @@ def ops_to_not_decompose():
326384
torch.ops.aten.upsample_bilinear2d.vec,
327385
torch.ops.aten.upsample_trilinear3d.vec,
328386
torch.ops.aten.upsample_bicubic2d.vec,
387+
# Attention
329388
torch.ops.aten.scaled_dot_product_attention.default,
330389
]

src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,13 +100,13 @@ activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float scale_fact
100100
if (transformation_callback(scaled_op))
101101
return false;
102102

103-
// in the case of decompressed_to_f32 nodes, no need to apply activations scaling
103+
// If a decompressed_to_f32 Convert node is present, we need to add scale_up layer after it.
104104
std::shared_ptr<ov::Node> output_of_scaled_op = scaled_op;
105105
auto child_node = scaled_op->get_output_target_inputs(0).begin()->get_node();
106106
if (scaled_op->get_output_target_inputs(0).size() == 1 && ov::is_type<v0::Convert>(child_node) &&
107107
ov::fp16_compression_is_disabled(child_node->shared_from_this()) &&
108108
constant_folding_is_disabled(child_node->shared_from_this())) {
109-
return false;
109+
output_of_scaled_op = child_node->shared_from_this();
110110
}
111111

112112
const std::vector<float> scale_up_value = {scale_factor};

src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,16 @@
1515
#include "openvino/op/constant.hpp"
1616
#include "openvino/op/convolution.hpp"
1717
#include "openvino/op/group_normalization.hpp"
18+
#include "openvino/op/matmul.hpp"
1819
#include "openvino/op/multiply.hpp"
1920
#include "openvino/op/mvn.hpp"
2021
#include "openvino/op/parameter.hpp"
2122
#include "openvino/op/reshape.hpp"
2223
#include "openvino/op/shape_of.hpp"
2324
#include "openvino/op/variadic_split.hpp"
2425
#include "openvino/pass/manager.hpp"
26+
#include "transformations/common_optimizations/lin_op_sequence_fusion.hpp"
27+
#include "transformations/common_optimizations/nop_elimination.hpp"
2528
#include "transformations/utils/utils.hpp"
2629

2730
using namespace ov;
@@ -73,6 +76,43 @@ TEST_F(TransformationTestsF, ScaleDownSingleLayerTest) {
7376
}
7477
}
7578

79+
TEST_F(TransformationTestsF, ScaleDownSingleLayerTest_f32) {
80+
float scale_factor = 128.f;
81+
{
82+
auto input = std::make_shared<v0::Parameter>(ov::element::f16, ov::PartialShape{1, 16});
83+
auto weights_const0 = v0::Constant::create(ov::element::f16, ov::Shape{16, 8}, {1});
84+
auto matmul0 = std::make_shared<v0::MatMul>(input, weights_const0);
85+
auto weights_const1 = v0::Constant::create(ov::element::f16, ov::Shape{8, 16}, {1});
86+
auto matmul1 = std::make_shared<v0::MatMul>(matmul0, weights_const1);
87+
auto convert = std::make_shared<v0::Convert>(matmul1, ov::element::f32);
88+
disable_fp16_compression(convert);
89+
disable_constant_folding(convert);
90+
auto convert_f16 = std::make_shared<v0::Convert>(convert, ov::element::f16);
91+
auto result = std::make_shared<v0::Result>(convert_f16);
92+
93+
model = std::make_shared<ov::Model>(ov::ResultVector{result}, ov::ParameterVector{input});
94+
manager.register_pass<ov::pass::activations_scaling::ScaleDownSingleLayer>(scale_factor, ov::element::f16);
95+
manager.register_pass<ov::pass::MultiplyMultiplyFusion>();
96+
manager.register_pass<ov::pass::EliminateEltwise>();
97+
}
98+
{
99+
auto input = std::make_shared<v0::Parameter>(ov::element::f16, ov::PartialShape{1, 16});
100+
auto weights_const0 = v0::Constant::create(ov::element::f16, ov::Shape{16, 8}, {1});
101+
auto scale_down_const = v0::Constant::create(ov::element::f16, ov::Shape{}, {1.f / scale_factor});
102+
auto scale_down = std::make_shared<v1::Multiply>(input, scale_down_const);
103+
auto matmul0 = std::make_shared<v0::MatMul>(scale_down, weights_const0);
104+
auto weights_const1 = v0::Constant::create(ov::element::f16, ov::Shape{8, 16}, {1});
105+
auto matmul1 = std::make_shared<v0::MatMul>(matmul0, weights_const1);
106+
auto convert = std::make_shared<v0::Convert>(matmul1, ov::element::f32);
107+
auto scale_up_const = v0::Constant::create(ov::element::f32, ov::Shape{}, {scale_factor});
108+
auto scale_up = std::make_shared<v1::Multiply>(convert, scale_up_const);
109+
auto convert_f16 = std::make_shared<v0::Convert>(scale_up, ov::element::f16);
110+
auto result = std::make_shared<v0::Result>(convert_f16);
111+
112+
model_ref = std::make_shared<ov::Model>(ov::ResultVector{result}, ov::ParameterVector{input});
113+
}
114+
}
115+
76116
TEST_F(TransformationTestsF, EliminateScalarMulTest) {
77117
double epsilon = 1.f;
78118
float scale_factor = 8.f;

0 commit comments

Comments
 (0)