Merge remote-tracking branch 'upstream/main' into mvafin/bitnet_support

mvafin · mvafin · commit 841f3d43f598 · 2025-08-20T11:22:33.000+02:00
diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml
@@ -8,6 +8,10 @@ on:
       - doc-builder*
       - v*-release
 
+env:
+  UV_SYSTEM_PYTHON: 1
+  UV_TORCH_BACKEND: auto
+
 jobs:
   build_documentation:
     runs-on: ubuntu-22.04
@@ -21,13 +25,13 @@ jobs:
       - uses: actions/checkout@v4
       - uses: actions/setup-node@v4
         with:
-          node-version: '18'
+          node-version: "18"
           cache-dependency-path: "kit/package-lock.json"
 
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
-          python-version: '3.11'
+          python-version: "3.11"
 
       - name: Set environment variables
         run: |
@@ -45,11 +49,9 @@ jobs:
 
       - name: Setup environment
         run: |
-          python -m pip install --upgrade pip
-          python -m pip install --upgrade setuptools
-          python -m pip install git+https://github.com/huggingface/doc-builder
-          python -m pip install .[quality]
-          python -m pip install openvino nncf neural-compressor[pt] diffusers accelerate
+          pip install --upgrade pip uv
+          uv pip install git+https://github.com/huggingface/doc-builder
+          uv pip install ".[quality]" nncf openvino "neural-compressor[pt]>3.4" diffusers accelerate  # quoted so '>' is a version bound, not a redirect
 
       - name: Make documentation
         shell: bash
diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml
@@ -9,10 +9,13 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true
 
+env:
+  UV_SYSTEM_PYTHON: 1
+  UV_TORCH_BACKEND: auto
+
 jobs:
   build_documentation:
     runs-on: ubuntu-22.04
-
     env:
       COMMIT_SHA: ${{ github.event.pull_request.head.sha }}
       PR_NUMBER: ${{ github.event.number }}
@@ -21,42 +24,34 @@ jobs:
 
     steps:
       - uses: actions/checkout@v4
+      - uses: actions/setup-node@v4
         with:
-          repository: "huggingface/doc-builder"
-          path: doc-builder
+          node-version: "18"
+          cache-dependency-path: "kit/package-lock.json"
 
-      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v4
         with:
-          repository: "huggingface/optimum-intel"
-          path: optimum-intel
-
-      - name: Setup Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: 3.9
+          python-version: "3.11"
 
       - name: Setup environment
         run: |
-          pip install --upgrade pip
-          pip uninstall -y doc-builder
-          cd doc-builder
-          git pull origin main
-          pip install .
-          pip install black
-          cd ..
+          pip install --upgrade pip uv
+          uv pip install git+https://github.com/huggingface/doc-builder
+          uv pip install ".[quality]" nncf openvino "neural-compressor[pt]>3.4" diffusers accelerate  # quoted so '>' is a version bound, not a redirect
 
       - name: Make documentation
+        shell: bash
         run: |
-          cd optimum-intel
-          make doc BUILD_DIR=intel-doc-build VERSION=pr_$PR_NUMBER COMMIT_SHA_SUBPACKAGE=$COMMIT_SHA CLONE_URL=$PR_CLONE_URL
-          cd ..
-
-      - name: Save commit_sha & pr_number
-        run: |
-          cd optimum-intel
-          sudo chmod -R ugo+rwx intel-doc-build
+          doc-builder build optimum.intel docs/source/ \
+            --repo_name optimum-intel \
+            --build_dir intel-doc-build/ \
+            --version pr_${{ env.PR_NUMBER }} \
+            --version_tag_suffix "" \
+            --html \
+            --clean
           cd intel-doc-build
-          sudo mv optimum.intel optimum-intel
+          mv optimum.intel optimum-intel
           echo ${{ env.COMMIT_SHA }} > ./commit_sha
           echo ${{ env.PR_NUMBER }} > ./pr_number
 
diff --git a/Makefile b/Makefile
@@ -59,3 +59,12 @@ doc: build_doc_docker_image
 		--version_tag_suffix "" \
 		--html \
 		--clean
+
+clean:
+	rm -rf build
+	rm -rf dist
+	rm -rf .pytest_cache
+	rm -rf .ruff_cache
+	rm -rf .mypy_cache
+	rm -rf optimum_intel.egg-info
+	rm -rf *__pycache__
diff --git a/docs/Dockerfile b/docs/Dockerfile
diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
@@ -164,7 +164,6 @@ def init_model_configs():
         "transformers",
         "AutoModelForImageTextToText",
     )
-
     TasksManager._CUSTOM_CLASSES[("pt", "llava_next_video", "image-text-to-text")] = (
         "transformers",
         "AutoModelForVision2Seq",
@@ -300,21 +299,46 @@ def patch_model_for_export(
         return Qwen2MoEPatcher(self, model, model_kwargs=model_kwargs)
 
 
-@register_in_tasks_manager("qwen3", *["text-generation", "text-generation-with-past"], library_name="transformers")
+@register_in_tasks_manager(
+    "qwen3",
+    *[
+        "text-generation",
+        "text-generation-with-past",
+        "feature-extraction",
+        "feature-extraction-with-past",
+        "text-classification",
+    ],
+    library_name="transformers",
+)
 class Qwen3OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig):
     MIN_TRANSFORMERS_VERSION = "4.51.0"
 
     DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, GemmaDummyPastKeyValuesGenerator)
     DUMMY_PKV_GENERATOR_CLASS = GemmaDummyPastKeyValuesGenerator
     NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
 
+    @property
+    def inputs(self) -> Dict[str, Dict[int, str]]:
+        if self.task in ["feature-extraction"]:
+            common_inputs = {
+                "input_ids": {0: "batch_size", 1: "sequence_length"},
+                "attention_mask": {0: "batch_size", 1: "sequence_length"},
+            }
+        else:
+            common_inputs = super().inputs
+        return common_inputs
+
     def patch_model_for_export(
         self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
     ) -> "ModelPatcher":
         return OVDecoderModelPatcher(self, model, model_kwargs=model_kwargs)
 
 
-@register_in_tasks_manager("qwen3_moe", *["text-generation", "text-generation-with-past"], library_name="transformers")
+@register_in_tasks_manager(
+    "qwen3_moe",
+    *["text-generation", "text-generation-with-past", "feature-extraction", "feature-extraction-with-past"],
+    library_name="transformers",
+)
 class Qwen3MoEOpenVINOConfig(Qwen3OpenVINOConfig):
     def patch_model_for_export(
         self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
@@ -3501,7 +3525,11 @@ class Qwen2VLConfigBehavior(str, enum.Enum):
     TEXT_EMBEDDINGS = "text_embeddings"
 
 
-@register_in_tasks_manager("qwen2_vl", *["image-text-to-text", "video-text-to-text"], library_name="transformers")
+@register_in_tasks_manager(
+    "qwen2_vl",
+    *["image-text-to-text", "video-text-to-text"],
+    library_name="transformers",
+)
 class Qwen2VLOpenVINOConfig(BaseVLMOpenVINOConfig):
     SUPPORTED_BEHAVIORS = [model_type.value for model_type in Qwen2VLConfigBehavior]
     NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
@@ -3634,7 +3662,11 @@ def outputs(self) -> Dict[str, Dict[int, str]]:
         return {}
 
 
-@register_in_tasks_manager("qwen2_5_vl", *["image-text-to-text", "video-text-to-text"], library_name="transformers")
+@register_in_tasks_manager(
+    "qwen2_5_vl",
+    *["image-text-to-text", "video-text-to-text"],
+    library_name="transformers",
+)
 class Qwen2_5_VLOpenVINOConfig(Qwen2VLOpenVINOConfig):
     MIN_TRANSFORMERS_VERSION = version.parse("4.49.0")
 
diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
@@ -356,14 +356,10 @@ def __enter__(self):
             # Although I'm not sure this is the right way to handle this, we are basically pretending that -65,504 is -inf
             ALL_MASK_ATTENTION_FUNCTIONS.register("eager", eager_mask_without_vmap)
 
-            # for non-stateful decoder models, we use eager mask without vmap for sdpa as well
-            # to avoid a nan output issue in OpenVINO that only happens in case of non-stateful models
-            if not getattr(self.real_config, "stateful", False):
-                logger.warning(
-                    "Exporting a non-stateful decoder model currently results in a nan output in OpenVINO. "
-                    "There might be a performance impact due to the use of eager mask (floats) instead of sdpa mask (bools). "
-                )
-                ALL_MASK_ATTENTION_FUNCTIONS.register("sdpa", eager_mask_without_vmap)
+            # for decoder models, we use eager mask without vmap for sdpa as well
+            # to avoid a nan output issue in OpenVINO that only happens in case of:
+            # non-stateful models on cpu and stateful models on npu
+            ALL_MASK_ATTENTION_FUNCTIONS.register("sdpa", eager_mask_without_vmap)
 
     def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
@@ -4771,14 +4767,10 @@ def __enter__(self):
             # Although I'm not sure this is the right way to handle this, we are basically pretending that -65,504 is -inf
             ALL_MASK_ATTENTION_FUNCTIONS.register("eager", eager_mask_without_vmap)
 
-            # for non-stateful decoder models, we use eager mask without vmap for sdpa as well
-            # to avoid a nan output issue in OpenVINO that only happens in case of non-stateful models
-            if not getattr(self.real_config, "stateful", False):
-                logger.warning(
-                    "Exporting a non-stateful decoder model currently results in a nan output in OpenVINO. "
-                    "There might be a performance impact due to the use of eager mask (floats) instead of sdpa mask (bools). "
-                )
-                ALL_MASK_ATTENTION_FUNCTIONS.register("sdpa", eager_mask_without_vmap)
+            # for decoder models, we use eager mask without vmap for sdpa as well
+            # to avoid a nan output issue in OpenVINO that only happens in case of:
+            # non-stateful models on cpu and stateful models on npu
+            ALL_MASK_ATTENTION_FUNCTIONS.register("sdpa", eager_mask_without_vmap)
 
     def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py
@@ -122,6 +122,11 @@
     SanaSprintPipeline = object
 
 
+if is_diffusers_version(">=", "0.35.0"):
+    from diffusers.models.cache_utils import CacheMixin
+else:
+    CacheMixin = object
+
 DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER = "transformer"
 DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER = "text_encoder_3"
 
@@ -1072,7 +1077,7 @@ def __call__(self, *args, **kwargs):
         return self.auto_model_class.__call__(self, *args, **kwargs)
 
 
-class OVPipelinePart(ConfigMixin):
+class OVPipelinePart(ConfigMixin, CacheMixin):
     config_name: str = CONFIG_NAME
 
     def __init__(
@@ -1161,6 +1166,11 @@ def __call__(self, *args, **kwargs):
     def modules(self):
         return []
 
+    def named_modules(self):
+        # starting from diffusers 0.35.0 some model parts inherit from `CacheMixin` which uses `named_modules` method
+        # to register some hooks for attention caching; we yield nothing here since such hooks can't be used with OpenVINO
+        yield from []
+
 
 class OVModelTextEncoder(OVPipelinePart):
     def __init__(self, model: openvino.Model, parent_pipeline: OVDiffusionPipeline, model_name: str = ""):
diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py
@@ -98,6 +98,9 @@ class ExportModelTest(unittest.TestCase):
             {"stable-diffusion-3": OVStableDiffusion3Pipeline, "flux": OVFluxPipeline, "ltx-video": OVLTXPipeline}
         )
 
+    if is_transformers_version(">=", "4.51"):
+        SUPPORTED_ARCHITECTURES.update({"qwen3": OVModelForFeatureExtraction})
+
     if is_transformers_version(">=", "4.54"):
         SUPPORTED_ARCHITECTURES.update({"ernie4_5": OVModelForCausalLM})
 
diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
@@ -146,6 +146,7 @@ class OVCLIExportTestCase(unittest.TestCase):
         "mamba": 2,
         "falcon-mamba": 2,
         "ernie4_5": 2,
+        "qwen3": 2,
     }
 
     TOKENIZER_CHAT_TEMPLATE_TESTS_MODELS = {
diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
@@ -1028,6 +1028,9 @@ class OVModelForFeatureExtractionIntegrationTest(unittest.TestCase):
         "sentence-transformers-bert",
     )
 
+    if is_transformers_version(">=", "4.51.0"):
+        SUPPORTED_ARCHITECTURES += ("qwen3",)
+
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     def test_compare_to_transformers(self, model_arch):
         model_id = MODEL_NAMES[model_arch]

Original file line number	Diff line number	Diff line change
`@@ -98,6 +98,9 @@ class ExportModelTest(unittest.TestCase):`
`98`	`98`	`{"stable-diffusion-3": OVStableDiffusion3Pipeline, "flux": OVFluxPipeline, "ltx-video": OVLTXPipeline}`
`99`	`99`	`)`
`100`	`100`
	`101`	`+ if is_transformers_version(">=", "4.51"):`
	`102`	`+ SUPPORTED_ARCHITECTURES.update({"qwen3": OVModelForFeatureExtraction})`
	`103`	`+`
`101`	`104`	`if is_transformers_version(">=", "4.54"):`
`102`	`105`	`SUPPORTED_ARCHITECTURES.update({"ernie4_5": OVModelForCausalLM})`
`103`	`106`
Original file line number	Diff line number	Diff line change
`@@ -146,6 +146,7 @@ class OVCLIExportTestCase(unittest.TestCase):`
`146`	`146`	`"mamba": 2,`
`147`	`147`	`"falcon-mamba": 2,`
`148`	`148`	`"ernie4_5": 2,`
	`149`	`+ "qwen3": 2,`
`149`	`150`	`}`
`150`	`151`
`151`	`152`	`TOKENIZER_CHAT_TEMPLATE_TESTS_MODELS = {`
Original file line number	Diff line number	Diff line change
`@@ -1028,6 +1028,9 @@ class OVModelForFeatureExtractionIntegrationTest(unittest.TestCase):`
`1028`	`1028`	`"sentence-transformers-bert",`
`1029`	`1029`	`)`
`1030`	`1030`
	`1031`	`+ if is_transformers_version(">=", "4.51.0"):`
	`1032`	`+ SUPPORTED_ARCHITECTURES += ("qwen3",)`
	`1033`	`+`
`1031`	`1034`	`@parameterized.expand(SUPPORTED_ARCHITECTURES)`
`1032`	`1035`	`def test_compare_to_transformers(self, model_arch):`
`1033`	`1036`	`model_id = MODEL_NAMES[model_arch]`