From 24ec74a45b23875036f8781fc69fdac6f4c13eb9 Mon Sep 17 00:00:00 2001
From: Pedro Cuenca
Date: Mon, 21 Apr 2025 22:01:12 +0200
Subject: [PATCH 1/2] nits on any-to-any task

---
 packages/tasks/src/tasks/any-to-any/about.md | 40 +++++++++++++-------
 1 file changed, 27 insertions(+), 13 deletions(-)

diff --git a/packages/tasks/src/tasks/any-to-any/about.md b/packages/tasks/src/tasks/any-to-any/about.md
index 3921c2b5f6..fff94fab30 100644
--- a/packages/tasks/src/tasks/any-to-any/about.md
+++ b/packages/tasks/src/tasks/any-to-any/about.md
@@ -6,7 +6,7 @@ Any-to-any models can help embodied agents operate in multi-sensory environments
 
 ### Real-time Accessibility Systems
 
-Vision-language based any-to-any models can be used aid visually impaired people. A real-time on-device any-to-any model can take a real-world video stream from wearable glasses, and describe the scene in audio (e.g., "A person in a red coat is walking toward you") or provide real-time closed captions and environmental sound cues.
+Vision-language based any-to-any models can be used to aid visually impaired people. A real-time on-device any-to-any model can take a real-world video stream from wearable glasses, and describe the scene in audio (e.g., "A person in a red coat is walking toward you"), or provide real-time closed captions and environmental sound cues.
 
 ### Multimodal Content Creation
 
@@ -14,39 +14,53 @@ One can use any-to-any models to generate multimodal content. For example, given
 
 ## Inference
 
-You can infer with any-to-any models using transformers. Below is an example to infer Qwen2.5-Omni-7B model, make sure to check the model you're inferring with.
+You can infer with any-to-any models using transformers. Below is an example that passes a video as part of a chat conversation to the Qwen2.5-Omni-7B model, and retrieves text and audio responses. Make sure to check the model you're inferring with.
 
 ```python
 import soundfile as sf
-from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor
+from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
 from qwen_omni_utils import process_mm_info
 
-model = Qwen2_5OmniModel.from_pretrained("Qwen/Qwen2.5-Omni-7B", torch_dtype="auto", device_map="auto")
-
+model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
+    "Qwen/Qwen2.5-Omni-7B",
+    torch_dtype="auto",
+    device_map="auto",
+    attn_implementation="flash_attention_2",
+)
 processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")
 
 conversation = [
     {
         "role": "system",
-        "content": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.",
+        "content": [
+            {"type": "text", "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}
+        ],
     },
     {
         "role": "user",
         "content": [
             {"type": "video", "video": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/draw.mp4"},
+            {"type": "text", "text": "What can you hear and see in this video?"},
         ],
     },
 ]
 
-USE_AUDIO_IN_VIDEO = True
-
-text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
-audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
-inputs = processor(text=text, audios=audios, images=images, videos=videos, return_tensors="pt", padding=True, use_audio_in_video=USE_AUDIO_IN_VIDEO)
-inputs = inputs.to(model.device).to(model.dtype)
+inputs = processor.apply_chat_template(
+    conversation,
+    load_audio_from_video=True,
+    add_generation_prompt=True,
+    tokenize=True,
+    return_dict=True,
+    return_tensors="pt",
+    video_fps=2,
+
+    # kwargs to be passed to `Qwen2-5-OmniProcessor`
+    padding=True,
+    use_audio_in_video=True,
+)
 
 # Inference: Generation of the output text and audio
-text_ids, audio = model.generate(**inputs, use_audio_in_video=USE_AUDIO_IN_VIDEO)
+text_ids, audio = model.generate(**inputs, use_audio_in_video=True)
 
 text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
 print(text)

From b2a947ec31b2769726c4496720d5fbd2e8a691f2 Mon Sep 17 00:00:00 2001
From: Pedro Cuenca
Date: Mon, 21 Apr 2025 22:02:13 +0200
Subject: [PATCH 2/2] Update packages/tasks/src/tasks/any-to-any/about.md

---
 packages/tasks/src/tasks/any-to-any/about.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/packages/tasks/src/tasks/any-to-any/about.md b/packages/tasks/src/tasks/any-to-any/about.md
index fff94fab30..6e7c42430a 100644
--- a/packages/tasks/src/tasks/any-to-any/about.md
+++ b/packages/tasks/src/tasks/any-to-any/about.md
@@ -19,7 +19,6 @@ You can infer with any-to-any models using transformers. Below is an example tha
 ```python
 import soundfile as sf
 from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
-from qwen_omni_utils import process_mm_info
 
 model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
     "Qwen/Qwen2.5-Omni-7B",
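
The documented example imports `soundfile` so the speech returned by `model.generate` can be written to disk. Below is a minimal sketch of that step, assuming `audio` is the waveform tensor returned in the example above and a 24 kHz output sample rate; the helper name `save_generated_audio`, the output path, and the sample rate are illustrative assumptions rather than part of the patches, so check the model card of the model you use.

```python
import soundfile as sf
import torch

def save_generated_audio(audio: torch.Tensor, path: str = "output.wav", sample_rate: int = 24000) -> None:
    # Flatten the waveform tensor, move it to the CPU, and write it out as a WAV file.
    # sample_rate=24000 is an assumption here; verify it against the model card.
    sf.write(path, audio.reshape(-1).detach().cpu().numpy(), samplerate=sample_rate)

# Usage with the `audio` tensor returned by `model.generate(...)` in the example above:
# save_generated_audio(audio)
```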