@@ -1539,11 +1539,8 @@ def apply_chat_template(
1539
1539
for key in ["image" , "url" , "path" , "base64" ]
1540
1540
if key in vision_info and vision_info ["type" ] == "image"
1541
1541
]
1542
- video_infos = [
1543
- {
1544
- "source" : key ,
1545
- "data" : vision_info [key ],
1546
- }
1542
+ video_fnames = [
1543
+ vision_info [key ]
1547
1544
for vision_info in visuals
1548
1545
for key in ["video" , "url" , "path" ]
1549
1546
if key in vision_info and vision_info ["type" ] == "video"
@@ -1557,40 +1554,40 @@ def apply_chat_template(
1557
1554
for fname in audio_fnames :
1558
1555
batch_audios .append (load_audio (fname , sampling_rate = mm_load_kwargs ["sampling_rate" ]))
1559
1556
else :
1560
- for video_info in video_infos :
1561
- if video_info [ 'source' ] in ["path" , "url" ]:
1557
+ for fname in video_fnames :
1558
+ if fname in ["path" , "url" ]:
1562
1559
# If the video is a file path or URL, we load the audio from the video
1563
1560
# and append it to the batch_audios list
1564
- batch_audios .append (load_audio (video_info ['data' ], sampling_rate = mm_load_kwargs ["sampling_rate" ]))
1565
-
1566
- for video_info in video_infos :
1567
- if video_info ['source' ] == "video" :
1568
- if isinstance (video_info ['data' ], (list , tuple )) and isinstance (video_info ['data' ][0 ], str ):
1569
- # Case 1(a): Video is provided as a list of image file names
1570
- video = [np .array (load_image (image_fname )) for image_fname in video_info ['data' ]]
1571
- video = np .stack (video )
1572
- metadata = None
1573
- logger .warning (
1574
- "When loading the video from list of images, we cannot infer metadata such as `fps` or `duration`. "
1575
- "If your model requires metadata during processing, please load the whole video and let the processor sample frames instead."
1576
- )
1577
- elif isinstance (video_info ['data' ], dict ):
1578
- # Case 1(b): Video is provided as a dictionary with frames, fps, duration, metadata etc.
1579
- video , metadata = self ._validate_video_content (video_info ['data' ])
1561
+ batch_audios .append (load_audio (fname , sampling_rate = mm_load_kwargs ["sampling_rate" ]))
1580
1562
else :
1581
1563
raise ValueError (
1582
- f"Expected video data to be a list of image file names, a dictionary with 'frames' key, or a string (file path or URL), but got { type (video_info ['data' ])} ."
1583
- )
1584
- else :
1585
- # Case 2: Video is provided as a file path or URL
1586
- if not isinstance (video_info ['data' ], str ):
1587
- raise ValueError (
1588
- f"Expected video data to be a string (file path or URL), but got { type (video_info ['data' ])} ."
1564
+ f"To load audio from video, you must provide video as a file path or URL, but got { fname } ."
1589
1565
)
1566
+
1567
+ for fname in video_fnames :
1568
+ if isinstance (fname , (list , tuple )) and isinstance (fname [0 ], str ):
1569
+ # Case 1(a): Video is provided as a list of image file names
1570
+ video = [np .array (load_image (image_fname )) for image_fname in fname ]
1571
+ video = np .stack (video )
1572
+ metadata = None
1573
+ logger .warning (
1574
+ "When loading the video from list of images, we cannot infer metadata such as `fps` or `duration`. "
1575
+ "If your model requires metadata during processing, please load the whole video and let the processor sample frames instead."
1576
+ )
1577
+ elif isinstance (fname , dict ):
1578
+ # Case 1(b): Video is provided as a dictionary with frames, fps, duration, metadata etc.
1579
+ video , metadata = self ._validate_video_content (fname )
1580
+ elif isinstance (fname , str ):
1581
+ # Case 1(c): Video is provided as a single file path or URL
1590
1582
video , metadata = load_video (
1591
- video_info [ 'data' ] ,
1583
+ fname ,
1592
1584
backend = mm_load_kwargs ["video_load_backend" ],
1593
1585
)
1586
+ else :
1587
+ raise ValueError (
1588
+ f"Expected video data to be a file path or URL or "
1589
+ f"a list of image file names or a dictionary with 'frames' key and 'metadata' key, but got { type (fname )} ."
1590
+ )
1594
1591
videos .append (video )
1595
1592
video_metadata .append (metadata )
1596
1593
0 commit comments