quic · quic-amitraj · Jul 10, 2025 · Jul 16, 2025 · Jul 17, 2025 · Jul 23, 2025
@@ -50,6 +50,7 @@ def check_qaic_sdk():
         QEFFCommonLoader,
     )
     from QEfficient.compile.compile_helper import compile
+    from QEfficient.diffusers.pipelines.flux.pipeline_flux import QEFFFluxPipeline
     from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
     from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
     from QEfficient.peft import QEffAutoPeftModelForCausalLM
@@ -70,6 +71,7 @@ def check_qaic_sdk():
         "QEFFAutoModelForImageTextToText",
         "QEFFAutoModelForSpeechSeq2Seq",
         "QEFFCommonLoader",
+        "QEFFFluxPipeline",
     ]
 
 else:

@@ -8,7 +8,6 @@
 import gc
 import inspect
 import logging
-import re
 import shutil
 import subprocess
 import warnings
@@ -19,23 +18,20 @@
 import onnx
 import torch
 
-from QEfficient.base.onnx_transforms import CustomOpTransform, OnnxTransform, RenameFunctionOutputsTransform
+from QEfficient.base.onnx_transforms import OnnxTransform
 from QEfficient.base.pytorch_transforms import PytorchTransform
 from QEfficient.compile.qnn_compiler import compile as qnn_compile
 from QEfficient.generation.cloud_infer import QAICInferenceSession
-from QEfficient.transformers.cache_utils import InvalidIndexProvider
-from QEfficient.transformers.models.pytorch_transforms import get_decoder_layer_classes_for_export
 from QEfficient.utils import (
     constants,
     create_json,
     create_model_params,
     dump_qconfig,
-    export_wrapper,
     generate_mdp_partition_config,
     hash_dict_params,
     load_json,
 )
-from QEfficient.utils.torch_patches import apply_torch_patches, undo_torch_patches
+from QEfficient.utils.export_utils import export_wrapper
 
 logger = logging.getLogger(__name__)
 
@@ -121,8 +117,30 @@ def _model_offloaded_check(self) -> None:
             raise RuntimeError(error_msg)
 
     @property
-    @abstractmethod
-    def model_name(self) -> str: ...
+    def model_name(self) -> str:
+        """
+        Get the model class name without QEff/QEFF prefix.
+
+        This property extracts the underlying model's class name and removes
+        any QEff or QEFF prefix that may have been added during wrapping.
+
+        Returns:
+            str: Model class name (e.g., "CLIPTextModel" instead of "QEffCLIPTextModel")
+        """
+        mname = self.model.__class__.__name__
+        if mname.startswith("QEff") or mname.startswith("QEFF"):
+            mname = mname[4:]
+        return mname
+
+    @property
+    def get_model_config(self) -> Dict:
+        """
+        Get the model configuration as a dictionary.
+
+        Returns:
+            Dict: The configuration dictionary of the underlying HuggingFace model
+        """
+        return self.model.config.__dict__
 
     @abstractmethod
     def export(self, export_dir: Optional[str] = None) -> Path:
@@ -183,7 +201,6 @@ def _export(
         onnx_transform_kwargs: Optional[Dict[str, any]] = None,
         export_dir: Optional[str] = None,
         offload_pt_weights: bool = True,
-        use_onnx_subfunctions: bool = False,
     ) -> str:
         """
         Export the PyTorch model to ONNX and apply ONNX transforms
@@ -248,18 +265,8 @@ def _export(
                     input_names.append(param)
 
         try:
-            # Initialize the registry with your custom ops
+            # Export to ONNX
             export_kwargs = {} if export_kwargs is None else export_kwargs
-            if use_onnx_subfunctions:
-                warnings.warn(
-                    "The subfunction feature is experimental. Please note that using compile consecutively with and without subfunction may produce inconsistent results."
-                )
-                apply_torch_patches()
-                InvalidIndexProvider.SUBFUNC_ENABLED = True
-                output_names = [re.sub("_RetainedState", "_InternalRetainedState", s) for s in output_names]
-                export_kwargs["export_modules_as_functions"] = get_decoder_layer_classes_for_export(self.model)
-                self._onnx_transforms.append(RenameFunctionOutputsTransform)
-                self._onnx_transforms.append(CustomOpTransform)
 
             torch.onnx.export(
                 self.model,
@@ -300,12 +307,6 @@ def _export(
         finally:
             shutil.rmtree(tmp_onnx_dir, ignore_errors=True)
 
-        if use_onnx_subfunctions:
-            undo_torch_patches()
-            InvalidIndexProvider.SUBFUNC_ENABLED = False
-            self._onnx_transforms.remove(CustomOpTransform)
-            self._onnx_transforms.remove(RenameFunctionOutputsTransform)
-
         self.onnx_path = onnx_path
         return onnx_path
 

@@ -0,0 +1,95 @@
+
+<div align="center">
+
+
+# **Diffusion Models on Qualcomm Cloud AI 100**
+
+
+<div align="center">
+
+### 🎨 **Experience the Future of AI Image Generation**
+
+*Optimized for Qualcomm Cloud AI 100*
+
+<img src="../../docs/image/girl_laughing.png" alt="Sample Output" width="400">
+
+**Generated with**: `black-forest-labs/FLUX.1-schnell` • `"A girl laughing"` • 4 steps • 0.0 guidance scale •  ⚡
+
+
+
+</div>
+
+
+
+[![Diffusers](https://img.shields.io/badge/Diffusers-0.35.1-orange.svg)](https://github.com/huggingface/diffusers)
+</div>
+
+---
+
+## ✨ Overview
+
+QEfficient Diffusers brings state-of-the-art diffusion models to Qualcomm Cloud AI 100 hardware for text-to-image generation. Built on top of the popular HuggingFace Diffusers library, its optimized pipelines provide seamless inference on the accelerator.
+
+## 🛠️ Installation
+
+### Prerequisites
+
+Ensure you have Python 3.8+ and the required dependencies:
+
+```bash
+# Create Python virtual environment (Recommended Python 3.10)
+sudo apt install python3.10-venv
+python3.10 -m venv qeff_env
+source qeff_env/bin/activate
+pip install -U pip
+```
+
+### Install QEfficient
+
+```bash
+# Install from GitHub (includes diffusers support)
+pip install git+https://github.com/quic/efficient-transformers
+
+# Or build from source
+git clone https://github.com/quic/efficient-transformers.git
+cd efficient-transformers
+pip install build wheel
+python -m build --wheel --outdir dist
+pip install dist/qefficient-0.0.1.dev0-py3-none-any.whl
+```
+
+---
+
+## 🎯 Supported Models
+- ✅ [`black-forest-labs/FLUX.1-schnell`](https://huggingface.co/black-forest-labs/FLUX.1-schnell)
+
+---
+
+
+## 📚 Examples
+
+Check out our comprehensive examples in the [`examples/diffusers/`](../../examples/diffusers/) directory.
+
+---
+
+## 🤝 Contributing
+
+We welcome contributions! Please see our [Contributing Guide](../../CONTRIBUTING.md) for details.
+
+
+
+---
+
+## 🙏 Acknowledgments
+
+- **HuggingFace Diffusers**: For the excellent foundation library
+- **Stability AI**: For the amazing Stable Diffusion models
+
+---
+
+## 📞 Support
+
+- 📖 **Documentation**: [https://quic.github.io/efficient-transformers/](https://quic.github.io/efficient-transformers/)
+- 🐛 **Issues**: [GitHub Issues](https://github.com/quic/efficient-transformers/issues)
+
+---
+
@@ -0,0 +1,6 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# ----------------------------------------------------------------------------
@@ -0,0 +1,6 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# ----------------------------------------------------------------------------
@@ -0,0 +1,73 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# ----------------------------------------------------------------------------
+
+import torch
+from diffusers.models.attention import JointTransformerBlock, _chunked_feed_forward
+
+
+class QEffJointTransformerBlock(JointTransformerBlock):
+    def forward(
+        self, hidden_states: torch.FloatTensor, encoder_hidden_states: torch.FloatTensor, temb: torch.FloatTensor
+    ):
+        if self.use_dual_attention:
+            norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp, norm_hidden_states2, gate_msa2 = self.norm1(
+                hidden_states, emb=temb
+            )
+        else:
+            norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
+
+        if self.context_pre_only:
+            norm_encoder_hidden_states = self.norm1_context(encoder_hidden_states, temb)
+        else:
+            norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
+                encoder_hidden_states, emb=temb
+            )
+
+        # Attention.
+        attn_output, context_attn_output = self.attn(
+            hidden_states=norm_hidden_states, encoder_hidden_states=norm_encoder_hidden_states
+        )
+
+        # Process attention outputs for the `hidden_states`.
+        attn_output = gate_msa.unsqueeze(1) * attn_output
+        hidden_states = hidden_states + attn_output
+
+        if self.use_dual_attention:
+            attn_output2 = self.attn2(hidden_states=norm_hidden_states2)
+            attn_output2 = gate_msa2.unsqueeze(1) * attn_output2
+            hidden_states = hidden_states + attn_output2
+
+        norm_hidden_states = self.norm2(hidden_states)
+        norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
+        if self._chunk_size is not None:
+            # "feed_forward_chunk_size" can be used to save memory
+            ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
+        else:
+            ff_output = self.ff(norm_hidden_states, block_size=4096)
+        ff_output = gate_mlp.unsqueeze(1) * ff_output
+
+        hidden_states = hidden_states + ff_output
+
+        # Process attention outputs for the `encoder_hidden_states`.
+        if self.context_pre_only:
+            encoder_hidden_states = None
+        else:
+            context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output
+            encoder_hidden_states = encoder_hidden_states + context_attn_output
+
+            norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
+            norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
+            if self._chunk_size is not None:
+                # "feed_forward_chunk_size" can be used to save memory
+                context_ff_output = _chunked_feed_forward(
+                    self.ff_context, norm_encoder_hidden_states, self._chunk_dim, self._chunk_size
+                )
+            else:
+                context_ff_output = self.ff_context(norm_encoder_hidden_states, block_size=333)
+            encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output
+
+        return encoder_hidden_states, hidden_states