
Commit 7e0880b

committed
currently testing qwen and internvl models with video object fails
1 parent 630b6db commit 7e0880b

File tree

2 files changed, +33 −33 lines changed


src/transformers/processing_utils.py

Lines changed: 28 additions & 31 deletions
@@ -1539,11 +1539,8 @@ def apply_chat_template(
                         for key in ["image", "url", "path", "base64"]
                         if key in vision_info and vision_info["type"] == "image"
                     ]
-                    video_infos = [
-                        {
-                            "source": key,
-                            "data": vision_info[key],
-                        }
+                    video_fnames = [
+                        vision_info[key]
                         for vision_info in visuals
                         for key in ["video", "url", "path"]
                         if key in vision_info and vision_info["type"] == "video"
@@ -1557,40 +1554,40 @@ def apply_chat_template(
                         for fname in audio_fnames:
                             batch_audios.append(load_audio(fname, sampling_rate=mm_load_kwargs["sampling_rate"]))
                     else:
-                        for video_info in video_infos:
-                            if video_info['source'] in ["path", "url"]:
+                        for fname in video_fnames:
+                            if fname in ["path", "url"]:
                                 # If the video is a file path or URL, we load the audio from the video
                                 # and append it to the batch_audios list
-                                batch_audios.append(load_audio(video_info['data'], sampling_rate=mm_load_kwargs["sampling_rate"]))
-
-                    for video_info in video_infos:
-                        if video_info['source'] == "video":
-                            if isinstance(video_info['data'], (list, tuple)) and isinstance(video_info['data'][0], str):
-                                # Case 1(a): Video is provided as a list of image file names
-                                video = [np.array(load_image(image_fname)) for image_fname in video_info['data']]
-                                video = np.stack(video)
-                                metadata = None
-                                logger.warning(
-                                    "When loading the video from list of images, we cannot infer metadata such as `fps` or `duration`. "
-                                    "If your model requires metadata during processing, please load the whole video and let the processor sample frames instead."
-                                )
-                            elif isinstance(video_info['data'], dict):
-                                # Case 1(b): Video is provided as a dictionary with frames, fps, duration, metadata etc.
-                                video, metadata = self._validate_video_content(video_info['data'])
+                                batch_audios.append(load_audio(fname, sampling_rate=mm_load_kwargs["sampling_rate"]))
                             else:
                                 raise ValueError(
-                                    f"Expected video data to be a list of image file names, a dictionary with 'frames' key, or a string (file path or URL), but got {type(video_info['data'])}."
-                                )
-                        else:
-                            # Case 2: Video is provided as a file path or URL
-                            if not isinstance(video_info['data'], str):
-                                raise ValueError(
-                                    f"Expected video data to be a string (file path or URL), but got {type(video_info['data'])}."
+                                    f"To load audio from video, you must provide video as a file path or URL, but got {fname}."
                                 )
+
+                    for fname in video_fnames:
+                        if isinstance(fname, (list, tuple)) and isinstance(fname[0], str):
+                            # Case 1(a): Video is provided as a list of image file names
+                            video = [np.array(load_image(image_fname)) for image_fname in fname]
+                            video = np.stack(video)
+                            metadata = None
+                            logger.warning(
+                                "When loading the video from list of images, we cannot infer metadata such as `fps` or `duration`. "
+                                "If your model requires metadata during processing, please load the whole video and let the processor sample frames instead."
+                            )
+                        elif isinstance(fname, dict):
+                            # Case 1(b): Video is provided as a dictionary with frames, fps, duration, metadata etc.
+                            video, metadata = self._validate_video_content(fname)
+                        elif isinstance(fname, str):
+                            # Case 1(c): Video is provided as a single file path or URL
                             video, metadata = load_video(
-                                video_info['data'],
+                                fname,
                                 backend=mm_load_kwargs["video_load_backend"],
                             )
+                        else:
+                            raise ValueError(
+                                f"Expected video data to be a file path or URL or "
+                                f"a list of image file names or a dictionary with 'frames' key and 'metadata' key, but got {type(fname)}."
+                            )
                         videos.append(video)
                         video_metadata.append(metadata)
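
For reference, a minimal sketch of how the reworked video path would be driven from the caller's side. The message layout follows the existing multimodal chat-template convention, and the dict with 'frames' and 'metadata' keys is the new Case 1(b) that apply_chat_template routes to self._validate_video_content above; the checkpoint name and exact keyword arguments here are illustrative only, and per the commit message this path currently still fails for the Qwen and InternVL processors.

from transformers import AutoProcessor
from transformers.video_utils import load_video

# Decode the video up front so frames + metadata can be passed in directly
# (the dict form handled by Case 1(b) in the diff above).
frames, metadata = load_video(
    "https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4"
)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "video", "video": {"frames": frames, "metadata": metadata}},
            {"type": "text", "text": "Describe this video."},
        ],
    }
]

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")  # illustrative checkpoint
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)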

tests/test_processing_common.py

Lines changed: 5 additions & 2 deletions
@@ -34,7 +34,7 @@
     require_vision,
 )
 from transformers.utils import is_torch_available, is_vision_available
-
+from transformers.video_utils import load_video
 
 global_rng = random.Random()
 
@@ -44,6 +44,8 @@
 if is_torch_available():
     import torch
 
+# load a video file in memory for testing
+frames, metadata = load_video("https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4")
 
 MODALITY_INPUT_DATA = {
     "images": [
@@ -53,6 +55,7 @@
     "videos": [
         "https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4",
         ["https://www.ilankelman.org/stopsigns/australia.jpg", "https://www.ilankelman.org/stopsigns/australia.jpg"],
+        {'frames': frames, 'metadata': metadata},
     ],
     "audio": [
         "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3",
@@ -931,7 +934,7 @@ def test_apply_chat_template_audio(self, batch_size: int, return_tensors: str):
         )
 
     @require_av
-    @parameterized.expand([(1, "pt"), (2, "pt")])  # video processor supports only torchvision
+    @parameterized.expand([(1, "pt"), (2, "pt"), (3, "pt")])  # video processor supports only torchvision
     def test_apply_chat_template_video(self, batch_size: int, return_tensors: str):
         self._test_apply_chat_template(
             "video", batch_size, return_tensors, "videos_input_name", "video_processor", MODALITY_INPUT_DATA["videos"]
