Skip to content

Commit 72d3f44

Browse files
authored
Merge branch 'master' into mvafin/pt_fe/where_cond
2 parents 26244fe + 4402dee commit 72d3f44

File tree

102 files changed

+4522
-880
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

102 files changed

+4522
-880
lines changed

.github/dockerfiles/docker_tag

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
pr-34333
1+
pr-34596

.github/dockerfiles/ov_build/fedora_29/Dockerfile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@ USER root
77
RUN echo "timeout=60" >> /etc/dnf/dnf.conf && \
88
echo "retries=10" >> /etc/dnf/dnf.conf
99

10+
# Hackity hack: Fedora 29 is out of support for so long now
11+
# that we need to steal `ca-certificates` from Rocky Linux 8 repos
12+
# to trust "storage.openvinotoolkit.org" again
13+
RUN rpm -ihv --force https://download.rockylinux.org/pub/rocky/8/BaseOS/x86_64/os/Packages/c/ca-certificates-2025.2.80_v9.0.304-80.2.el8_10.noarch.rpm
14+
1015
RUN dnf update -y && dnf install -y \
1116
git \
1217
curl \

.github/workflows/job_pytorch_models_tests.yml

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -129,14 +129,6 @@ jobs:
129129
env:
130130
TEST_DEVICE: CPU
131131

132-
- name: RoPE Test
133-
if: ${{ inputs.model_scope == 'precommit' }}
134-
run: |
135-
export PYTHONPATH=${MODEL_HUB_TESTS_INSTALL_DIR}:$PYTHONPATH
136-
python3 -m pytest ${MODEL_HUB_TESTS_INSTALL_DIR}/transformation_tests/test_transformations.py -m precommit --html=${INSTALL_TEST_DIR}/TEST-torch_rope_tests.html --self-contained-html -v --tb=short -n 2
137-
env:
138-
TEST_DEVICE: CPU
139-
140132
- name: StatefulToStateless Test
141133
if: ${{ inputs.model_scope == 'precommit' }}
142134
run: |
@@ -164,6 +156,15 @@ jobs:
164156
env:
165157
TEST_DEVICE: CPU
166158

159+
- name: RoPE Test
160+
if: ${{ inputs.model_scope == 'precommit' }}
161+
run: |
162+
export PYTHONPATH=${MODEL_HUB_TESTS_INSTALL_DIR}:$PYTHONPATH
163+
python3 -m pip install -r ${INSTALL_TEST_DIR}/requirements_rope.txt
164+
python3 -m pytest ${MODEL_HUB_TESTS_INSTALL_DIR}/transformation_tests/test_transformations.py -m precommit --html=${INSTALL_TEST_DIR}/TEST-torch_rope_tests.html --self-contained-html -v --tb=short -n 2
165+
env:
166+
TEST_DEVICE: CPU
167+
167168
- name: Reformat unsupported ops file
168169
if: ${{ inputs.model_scope != 'precommit' && !cancelled()}}
169170
run: |

.github/workflows/ubuntu_22.yml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -631,6 +631,20 @@ jobs:
631631
# if: ${{ github.event_name == 'schedule' }}
632632
if: ${{ 'false' }} # Ticket: 143677
633633

634+
# Ticket: CVS-182443 — iGPU runners cannot install pip packages from PyPI, need to whitelist pypi.org and download.pytorch.org
635+
# iGPU_RoPE_Tests:
636+
# name: iGPU RoPE Tests
637+
# needs: [ Build, Smart_CI, Openvino_tokenizers ]
638+
# uses: ./.github/workflows/job_gpu_tests.yml
639+
# with:
640+
# device: 'igpu'
641+
# test_type: 'rope'
642+
# runner: "[ 'self-hosted', 'igpu' ]"
643+
# runner-group: 'Intel-GPU'
644+
# image: ubuntu:22.04
645+
# options: "--group-add 44 --group-add 993 --device /dev/dri/card1:/dev/dri/card1 --device /dev/dri/renderD128:/dev/dri/renderD128 -e HF_TOKEN"
646+
# if: fromJSON(needs.smart_ci.outputs.affected_components).GPU
647+
634648
Overall_Status:
635649
name: ci/gha_overall_status
636650
needs: [Smart_CI, Build, Debian_Packages, Samples, Conformance, CXX_Unit_Tests, Python_Unit_Tests, TensorFlow_Layer_Tests, Pytorch_Layer_Tests,

.github/workflows/windows_conditional_compilation.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,13 @@ jobs:
360360
should-setup-pip-paths: 'false'
361361
self-hosted-runner: 'false'
362362

363+
- name: Install Python's certifi for newer ca-certificates bundle for wget
364+
run: |
365+
python3 -m pip install certifi
366+
367+
- name: Set SSL_CERT_FILE for model downloading for unit tests
368+
run: echo SSL_CERT_FILE=$(python3 -m certifi) >> $env:GITHUB_ENV
369+
363370
- name: CMake configure - CC ON
364371
run: |
365372
cmake `

docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ Operation Specifications
8787
GRN-1 <operation-specs/normalization/grn-1>
8888
GRUCell-3 <operation-specs/sequence/gru-cell-3>
8989
GRUSequence-5 <operation-specs/sequence/gru-sequence-5>
90+
GatedDeltaNet <operation-specs/internal/gated-delta-net>
9091
GatherTree-1 <operation-specs/movement/gather-tree-1>
9192
Gather-1 <operation-specs/movement/gather-1>
9293
Gather-7 <operation-specs/movement/gather-7>
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
.. {#openvino_docs_ops_internal_GatedDeltaNet}
2+
3+
GatedDeltaNet
4+
=============
5+
6+
7+
.. meta::
8+
:description: Learn about GatedDeltaNet - a linear recurrent sequence processing
9+
operation based on the delta rule with a gating mechanism.
10+
11+
**Versioned name**: *GatedDeltaNet*
12+
13+
**Category**: *Sequence processing*
14+
15+
**Short description**: *GatedDeltaNet* represents a linear recurrent sequence model
16+
that combines the delta rule memory update with a gating mechanism.
17+
18+
**Detailed description**: *GatedDeltaNet* implements the recurrence from the paper
19+
`arXiv:2412.06464 <https://arxiv.org/abs/2412.06464>`__. It processes a sequence of
20+
query, key, and value vectors using the delta rule to update a hidden state matrix,
21+
controlled by a per-token forget ``gate`` (applied as ``exp(g)``) and a per-token
22+
write gate ``beta``. Queries are scaled by ``1 / sqrt(key_head_dim)`` before being used
23+
to compute the output. The following PyTorch-equivalent code illustrates the full
24+
computation:
25+
26+
.. code-block:: py
27+
28+
def torch_recurrent_gated_delta_rule(
29+
query, key, value, recurrent_state, gate, beta,
30+
):
31+
batch_size, sequence_length, num_heads, k_head_dim = key.shape
32+
v_head_dim = value.shape[-1]
33+
scale = 1 / (query.shape[-1] ** 0.5)
34+
query = query * scale
35+
36+
output_attn = torch.zeros(batch_size, sequence_length, num_heads, v_head_dim).to(value)
37+
output_recurrent_state = recurrent_state
38+
39+
for i in range(sequence_length):
40+
q_t = query[:, i]
41+
k_t = key[:, i]
42+
v_t = value[:, i]
43+
g_t = gate[:, i].exp().unsqueeze(-1).unsqueeze(-1)
44+
beta_t = beta[:, i].unsqueeze(-1)
45+
46+
output_recurrent_state = output_recurrent_state * g_t
47+
kv_mem = (output_recurrent_state * k_t.unsqueeze(-1)).sum(dim=-2)
48+
delta = (v_t - kv_mem) * beta_t
49+
output_recurrent_state = output_recurrent_state + k_t.unsqueeze(-1) * delta.unsqueeze(-2)
50+
output_attn[:, i] = (output_recurrent_state * q_t.unsqueeze(-1)).sum(dim=-2)
51+
52+
return output_attn, output_recurrent_state
53+
54+
55+
**Inputs**
56+
57+
* **1**: ``query`` - 4D tensor of type *T* and shape ``[batch_size, seq_len, num_heads, key_head_dim]``,
58+
the query vectors for each token and head. Scaled internally by ``1 / sqrt(key_head_dim)``
59+
before computing the output. **Required.**
60+
61+
* **2**: ``key`` - 4D tensor of type *T* and shape ``[batch_size, seq_len, num_heads, key_head_dim]``,
62+
the key vectors for each token and head. **Required.**
63+
64+
* **3**: ``value`` - 4D tensor of type *T* and shape ``[batch_size, seq_len, num_heads, value_head_dim]``,
65+
the value vectors for each token and head. **Required.**
66+
67+
* **4**: ``recurrent_state`` - 4D tensor of type *T* and shape
68+
``[batch_size, num_heads, key_head_dim, value_head_dim]``, the recurrent (initially all-zeros) hidden state matrix. **Required.**
69+
70+
* **5**: ``gate`` - 3D tensor of type *T* and shape ``[batch_size, seq_len, num_heads]``,
71+
the forget gate in log-space. Applied as ``exp(g)`` at each time step to decay the
72+
hidden state before the delta update. **Required.**
73+
74+
* **6**: ``beta`` - 3D tensor of type *T* and shape ``[batch_size, seq_len, num_heads]``,
75+
the write gate controlling how much of the delta correction is applied to the hidden
76+
state. **Required.**
77+
78+
79+
**Outputs**
80+
81+
* **1**: ``output_attn`` - 4D tensor of type *T* and shape
82+
``[batch_size, seq_len, num_heads, value_head_dim]``, the output vectors at each time step
83+
produced by applying the state matrix to the (scaled) query.
84+
85+
* **2**: ``output_recurrent_state`` - 4D tensor of type *T* and shape
86+
``[batch_size, num_heads, key_head_dim, value_head_dim]``, the hidden state matrix
87+
after processing the last token in the sequence.
88+
89+
90+
**Types**
91+
92+
* *T*: any supported floating-point type.
93+
94+
95+
**Example**
96+
97+
.. code-block:: xml
98+
:force:
99+
100+
<layer ... type="GatedDeltaNet" ...>
101+
<input>
102+
<port id="0"> <!-- `query` -->
103+
<dim>1</dim>
104+
<dim>16</dim>
105+
<dim>8</dim>
106+
<dim>64</dim>
107+
</port>
108+
<port id="1"> <!-- `key` -->
109+
<dim>1</dim>
110+
<dim>16</dim>
111+
<dim>8</dim>
112+
<dim>64</dim>
113+
</port>
114+
<port id="2"> <!-- `value` -->
115+
<dim>1</dim>
116+
<dim>16</dim>
117+
<dim>8</dim>
118+
<dim>128</dim>
119+
</port>
120+
<port id="3"> <!-- `recurrent_state` -->
121+
<dim>1</dim>
122+
<dim>8</dim>
123+
<dim>64</dim>
124+
<dim>128</dim>
125+
</port>
126+
<port id="4"> <!-- `gate` -->
127+
<dim>1</dim>
128+
<dim>16</dim>
129+
<dim>8</dim>
130+
</port>
131+
<port id="5"> <!-- `beta` -->
132+
<dim>1</dim>
133+
<dim>16</dim>
134+
<dim>8</dim>
135+
</port>
136+
</input>
137+
<output>
138+
<port id="6"> <!-- `output_attn` -->
139+
<dim>1</dim>
140+
<dim>16</dim>
141+
<dim>8</dim>
142+
<dim>128</dim>
143+
</port>
144+
<port id="7"> <!-- `output_recurrent_state` -->
145+
<dim>1</dim>
146+
<dim>8</dim>
147+
<dim>64</dim>
148+
<dim>128</dim>
149+
</port>
150+
</output>
151+
</layer>

src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/decompositions.py

Lines changed: 61 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -311,11 +311,69 @@ def get_export_decomposition_list():
311311

312312

313313
def ops_to_not_decompose():
314-
# list of operations that shouldn't be decomposed
314+
# list of operations that shouldn't be decomposed because
315+
# OpenVINO frontend handles them directly and more efficiently
315316
return [
316-
torch.ops.aten.col2im.default,
317+
# Activation functions - each maps to a single dedicated OV op
318+
torch.ops.aten.celu.default,
319+
torch.ops.aten.elu_.default,
320+
torch.ops.aten.glu.default,
321+
torch.ops.aten.hardsigmoid.default,
322+
torch.ops.aten.hardswish.default,
323+
torch.ops.aten.hardswish_.default,
324+
torch.ops.aten.hardtanh_.default,
325+
torch.ops.aten.leaky_relu_.default,
326+
torch.ops.aten.log_sigmoid_forward.default,
327+
torch.ops.aten.mish.default,
328+
torch.ops.aten.silu.default,
329+
torch.ops.aten.silu_.default,
330+
# Normalization
317331
torch.ops.aten.linear.default,
318332
torch.ops.aten.rms_norm.default,
333+
# Math and reduction ops with dedicated translators
334+
torch.ops.aten.all.default,
335+
torch.ops.aten.argsort.default,
336+
torch.ops.aten.argsort.stable,
337+
torch.ops.aten.baddbmm.default,
338+
torch.ops.aten.dot.default,
339+
torch.ops.aten.logaddexp.default,
340+
torch.ops.aten.logsumexp.default,
341+
torch.ops.aten.outer.default,
342+
torch.ops.aten.rad2deg.default,
343+
torch.ops.aten.std.correction,
344+
# Spatial and structural ops
345+
torch.ops.aten.channel_shuffle.default,
346+
torch.ops.aten.col2im.default,
347+
torch.ops.aten.pixel_shuffle.default,
348+
torch.ops.aten.pixel_unshuffle.default,
349+
torch.ops.aten.reflection_pad1d.default,
350+
torch.ops.aten.reflection_pad2d.default,
351+
torch.ops.aten.reflection_pad3d.default,
352+
torch.ops.aten.roll.default,
353+
# Index and scatter ops
354+
torch.ops.aten.index_add.default,
355+
torch.ops.aten.index_add_.default,
356+
torch.ops.aten.index_copy.default,
357+
torch.ops.aten.index_fill.int_Scalar,
358+
torch.ops.aten.index_fill_.int_Scalar,
359+
torch.ops.aten.masked_fill.Scalar,
360+
torch.ops.aten.masked_fill.Tensor,
361+
torch.ops.aten.masked_fill_.Scalar,
362+
torch.ops.aten.masked_fill_.Tensor,
363+
torch.ops.aten.select_scatter.default,
364+
# Tensor creation and manipulation
365+
torch.ops.aten.hstack.default,
366+
torch.ops.aten.linalg_cross.default,
367+
torch.ops.aten.linspace.default,
368+
torch.ops.aten.one_hot.default,
369+
torch.ops.aten.repeat_interleave.self_int,
370+
torch.ops.aten.repeat_interleave.self_Tensor,
371+
# Note: aten.take_along_dim.default is not listed here because
372+
# the translator doesn't handle the 2-input case (dim=None)
373+
torch.ops.aten.tril.default,
374+
torch.ops.aten.triu.default,
375+
torch.ops.aten.vstack.default,
376+
# Upsampling / interpolation
319377
torch.ops.aten.upsample_nearest1d.default,
320378
torch.ops.aten.upsample_nearest1d.vec,
321379
torch.ops.aten.upsample_nearest2d.default,
@@ -326,5 +384,6 @@ def ops_to_not_decompose():
326384
torch.ops.aten.upsample_bilinear2d.vec,
327385
torch.ops.aten.upsample_trilinear3d.vec,
328386
torch.ops.aten.upsample_bicubic2d.vec,
387+
# Attention
329388
torch.ops.aten.scaled_dot_product_attention.default,
330389
]

src/common/transformations/src/transformations/common_optimizations/activations_scaling.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,13 +100,13 @@ activations_scaling::ScaleDownSingleLayer::ScaleDownSingleLayer(float scale_fact
100100
if (transformation_callback(scaled_op))
101101
return false;
102102

103-
// in the case of decompressed_to_f32 nodes, no need to apply activations scaling
103+
// If a decompressed_to_f32 Convert node is present, we need to add scale_up layer after it.
104104
std::shared_ptr<ov::Node> output_of_scaled_op = scaled_op;
105105
auto child_node = scaled_op->get_output_target_inputs(0).begin()->get_node();
106106
if (scaled_op->get_output_target_inputs(0).size() == 1 && ov::is_type<v0::Convert>(child_node) &&
107107
ov::fp16_compression_is_disabled(child_node->shared_from_this()) &&
108108
constant_folding_is_disabled(child_node->shared_from_this())) {
109-
return false;
109+
output_of_scaled_op = child_node->shared_from_this();
110110
}
111111

112112
const std::vector<float> scale_up_value = {scale_factor};

src/common/transformations/tests/common_optimizations/activations_scaling_test.cpp

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,16 @@
1515
#include "openvino/op/constant.hpp"
1616
#include "openvino/op/convolution.hpp"
1717
#include "openvino/op/group_normalization.hpp"
18+
#include "openvino/op/matmul.hpp"
1819
#include "openvino/op/multiply.hpp"
1920
#include "openvino/op/mvn.hpp"
2021
#include "openvino/op/parameter.hpp"
2122
#include "openvino/op/reshape.hpp"
2223
#include "openvino/op/shape_of.hpp"
2324
#include "openvino/op/variadic_split.hpp"
2425
#include "openvino/pass/manager.hpp"
26+
#include "transformations/common_optimizations/lin_op_sequence_fusion.hpp"
27+
#include "transformations/common_optimizations/nop_elimination.hpp"
2528
#include "transformations/utils/utils.hpp"
2629

2730
using namespace ov;
@@ -73,6 +76,43 @@ TEST_F(TransformationTestsF, ScaleDownSingleLayerTest) {
7376
}
7477
}
7578

79+
TEST_F(TransformationTestsF, ScaleDownSingleLayerTest_f32) {
80+
float scale_factor = 128.f;
81+
{
82+
auto input = std::make_shared<v0::Parameter>(ov::element::f16, ov::PartialShape{1, 16});
83+
auto weights_const0 = v0::Constant::create(ov::element::f16, ov::Shape{16, 8}, {1});
84+
auto matmul0 = std::make_shared<v0::MatMul>(input, weights_const0);
85+
auto weights_const1 = v0::Constant::create(ov::element::f16, ov::Shape{8, 16}, {1});
86+
auto matmul1 = std::make_shared<v0::MatMul>(matmul0, weights_const1);
87+
auto convert = std::make_shared<v0::Convert>(matmul1, ov::element::f32);
88+
disable_fp16_compression(convert);
89+
disable_constant_folding(convert);
90+
auto convert_f16 = std::make_shared<v0::Convert>(convert, ov::element::f16);
91+
auto result = std::make_shared<v0::Result>(convert_f16);
92+
93+
model = std::make_shared<ov::Model>(ov::ResultVector{result}, ov::ParameterVector{input});
94+
manager.register_pass<ov::pass::activations_scaling::ScaleDownSingleLayer>(scale_factor, ov::element::f16);
95+
manager.register_pass<ov::pass::MultiplyMultiplyFusion>();
96+
manager.register_pass<ov::pass::EliminateEltwise>();
97+
}
98+
{
99+
auto input = std::make_shared<v0::Parameter>(ov::element::f16, ov::PartialShape{1, 16});
100+
auto weights_const0 = v0::Constant::create(ov::element::f16, ov::Shape{16, 8}, {1});
101+
auto scale_down_const = v0::Constant::create(ov::element::f16, ov::Shape{}, {1.f / scale_factor});
102+
auto scale_down = std::make_shared<v1::Multiply>(input, scale_down_const);
103+
auto matmul0 = std::make_shared<v0::MatMul>(scale_down, weights_const0);
104+
auto weights_const1 = v0::Constant::create(ov::element::f16, ov::Shape{8, 16}, {1});
105+
auto matmul1 = std::make_shared<v0::MatMul>(matmul0, weights_const1);
106+
auto convert = std::make_shared<v0::Convert>(matmul1, ov::element::f32);
107+
auto scale_up_const = v0::Constant::create(ov::element::f32, ov::Shape{}, {scale_factor});
108+
auto scale_up = std::make_shared<v1::Multiply>(convert, scale_up_const);
109+
auto convert_f16 = std::make_shared<v0::Convert>(scale_up, ov::element::f16);
110+
auto result = std::make_shared<v0::Result>(convert_f16);
111+
112+
model_ref = std::make_shared<ov::Model>(ov::ResultVector{result}, ov::ParameterVector{input});
113+
}
114+
}
115+
76116
TEST_F(TransformationTestsF, EliminateScalarMulTest) {
77117
double epsilon = 1.f;
78118
float scale_factor = 8.f;

0 commit comments

Comments
 (0)