From 24ec74a45b23875036f8781fc69fdac6f4c13eb9 Mon Sep 17 00:00:00 2001
From: Pedro Cuenca
Date: Mon, 21 Apr 2025 22:01:12 +0200
Subject: [PATCH 1/2] nits on any-to-any task

---
 packages/tasks/src/tasks/any-to-any/about.md | 40 +++++++++++++-------
 1 file changed, 27 insertions(+), 13 deletions(-)

diff --git a/packages/tasks/src/tasks/any-to-any/about.md b/packages/tasks/src/tasks/any-to-any/about.md
index 3921c2b5f6..fff94fab30 100644
--- a/packages/tasks/src/tasks/any-to-any/about.md
+++ b/packages/tasks/src/tasks/any-to-any/about.md
@@ -6,7 +6,7 @@ Any-to-any models can help embodied agents operate in multi-sensory environments
 
 ### Real-time Accessibility Systems
 
-Vision-language based any-to-any models can be used aid visually impaired people. A real-time on-device any-to-any model can take a real-world video stream from wearable glasses, and describe the scene in audio (e.g., "A person in a red coat is walking toward you") or provide real-time closed captions and environmental sound cues.
+Vision-language based any-to-any models can be used to aid visually impaired people. A real-time on-device any-to-any model can take a real-world video stream from wearable glasses, and describe the scene in audio (e.g., "A person in a red coat is walking toward you"), or provide real-time closed captions and environmental sound cues.
 
 ### Multimodal Content Creation
 
@@ -14,39 +14,53 @@ One can use any-to-any models to generate multimodal content. For example, given
 
 ## Inference
 
-You can infer with any-to-any models using transformers. Below is an example to infer Qwen2.5-Omni-7B model, make sure to check the model you're inferring with.
+You can infer with any-to-any models using transformers. Below is an example that passes a video as part of a chat conversation to the Qwen2.5-Omni-7B model, and retrieves text and audio responses. Make sure to check the model you're inferring with.
 
 ```python
 import soundfile as sf
-from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor
+from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
 from qwen_omni_utils import process_mm_info
 
-model = Qwen2_5OmniModel.from_pretrained("Qwen/Qwen2.5-Omni-7B", torch_dtype="auto", device_map="auto")
-
+model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
+    "Qwen/Qwen2.5-Omni-7B",
+    torch_dtype="auto",
+    device_map="auto",
+    attn_implementation="flash_attention_2",
+)
 processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")
 
 conversation = [
     {
         "role": "system",
-        "content": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.",
+        "content": [
+            {"type": "text", "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}
+        ],
     },
     {
         "role": "user",
         "content": [
             {"type": "video", "video": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/draw.mp4"},
+            {"type": "text", "text": "What can you hear and see in this video?"},
         ],
     },
 ]
 
-USE_AUDIO_IN_VIDEO = True
-
-text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
-audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
-inputs = processor(text=text, audios=audios, images=images, videos=videos, return_tensors="pt", padding=True, use_audio_in_video=USE_AUDIO_IN_VIDEO)
-inputs = inputs.to(model.device).to(model.dtype)
+inputs = processor.apply_chat_template(
+    conversation,
+    load_audio_from_video=True,
+    add_generation_prompt=True,
+    tokenize=True,
+    return_dict=True,
+    return_tensors="pt",
+    video_fps=2,
+
+    # kwargs to be passed to `Qwen2-5-OmniProcessor`
+    padding=True,
+    use_audio_in_video=True,
+)
 
 # Inference: Generation of the output text and audio
-text_ids, audio = model.generate(**inputs, use_audio_in_video=USE_AUDIO_IN_VIDEO)
+text_ids, audio = model.generate(**inputs, use_audio_in_video=True)
 
 text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
 print(text)

From b2a947ec31b2769726c4496720d5fbd2e8a691f2 Mon Sep 17 00:00:00 2001
From: Pedro Cuenca
Date: Mon, 21 Apr 2025 22:02:13 +0200
Subject: [PATCH 2/2] Update packages/tasks/src/tasks/any-to-any/about.md

---
 packages/tasks/src/tasks/any-to-any/about.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/packages/tasks/src/tasks/any-to-any/about.md b/packages/tasks/src/tasks/any-to-any/about.md
index fff94fab30..6e7c42430a 100644
--- a/packages/tasks/src/tasks/any-to-any/about.md
+++ b/packages/tasks/src/tasks/any-to-any/about.md
@@ -19,7 +19,6 @@ You can infer with any-to-any models using transformers. Below is an example tha
 ```python
 import soundfile as sf
 from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
-from qwen_omni_utils import process_mm_info
 
 model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
     "Qwen/Qwen2.5-Omni-7B",
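
The documented example imports `soundfile` so the speech returned by `model.generate` can be written to disk. Below is a minimal sketch of that step, assuming `audio` is the waveform tensor returned in the example above and a 24 kHz output sample rate; the helper name `save_generated_audio`, the output path, and the sample rate are illustrative assumptions rather than part of the patches, so check the model card of the model you use.

```python
import soundfile as sf
import torch

def save_generated_audio(audio: torch.Tensor, path: str = "output.wav", sample_rate: int = 24000) -> None:
    # Flatten the waveform tensor, move it to the CPU, and write it out as a WAV file.
    # sample_rate=24000 is an assumption here; verify it against the model card.
    sf.write(path, audio.reshape(-1).detach().cpu().numpy(), samplerate=sample_rate)

# Usage with the `audio` tensor returned by `model.generate(...)` in the example above:
# save_generated_audio(audio)
```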