Skip to content

Commit db06f5b

Browse files
committed
Address review comments
1 parent 8719706 commit db06f5b

File tree

4 files changed

+4
-13
lines changed

4 files changed

+4
-13
lines changed

src/transformers/video_utils.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -564,17 +564,11 @@ def sample_indices_fn_func(metadata, **fn_kwargs):
564564
sample_indices_fn = sample_indices_fn_func
565565

566566
if is_valid_image(video) or (isinstance(video, (list, tuple)) and is_valid_image(video[0])):
567+
# Case 1: Video is provided as a 4D numpy array or torch tensor (frames, height, width, channels)
567568
if not is_valid_video(video):
568569
raise ValueError(
569570
f"When passing video as decoded frames, video should be a 4D numpy array or torch tensor, but got {video.ndim} dimensions instead."
570571
)
571-
# Case 1: Video is provided as a 4D numpy array or torch tensor (frames, height, width, channels)
572-
if is_torch_tensor(video):
573-
video = video.numpy() # Convert torch tensor to numpy array
574-
logger.warning(
575-
"When loading the video from list of decoded frames, we cannot infer metadata such as `fps` or `duration`. "
576-
"If your model requires metadata during processing, please load the whole video and let the processor sample frames instead."
577-
)
578572
return video, None
579573

580574
if urlparse(video).netloc in ["www.youtube.com", "youtube.com"]:

tests/models/qwen2_5_omni/test_processor_qwen2_5_omni.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -423,8 +423,7 @@ def _test_apply_chat_template(
423423
self.assertEqual(len(out_dict["attention_mask"]), batch_size)
424424

425425
if modality == "video":
426-
# qwen pixels don't scale with bs same way as other models, calulate expected video token count based on video_grid_thw
427-
# TODO: update expected video token count calculation based on the internal processing logic of Qwen2_5OmniProcessor
426+
# qwen pixels don't scale with bs same way as other models, calculate expected video token count based on video_grid_thw
428427
expected_video_token_count = 0
429428
for thw in out_dict["video_grid_thw"]:
430429
expected_video_token_count += thw[0] * thw[1] * thw[2]

tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -240,8 +240,7 @@ def _test_apply_chat_template(
240240
self.assertEqual(len(out_dict["attention_mask"]), batch_size)
241241

242242
if modality == "video":
243-
# qwen pixels don't scale with bs same way as other models, calulate expected video token count based on video_grid_thw
244-
# TODO: update expected video token count calculation based on the internal processing logic of Qwen2_5VLProcessor
243+
# qwen pixels don't scale with bs same way as other models, calculate expected video token count based on video_grid_thw
245244
expected_video_token_count = 0
246245
for thw in out_dict["video_grid_thw"]:
247246
expected_video_token_count += thw[0] * thw[1] * thw[2]

tests/models/qwen2_vl/test_processor_qwen2_vl.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -240,8 +240,7 @@ def _test_apply_chat_template(
240240
self.assertEqual(len(out_dict["input_ids"]), batch_size)
241241
self.assertEqual(len(out_dict["attention_mask"]), batch_size)
242242
if modality == "video":
243-
# qwen pixels don't scale with bs same way as other models, calulate expected video token count based on video_grid_thw
244-
# TODO: update expected video token count calculation based on the internal processing logic of Qwen2VLProcessor
243+
# qwen pixels don't scale with bs same way as other models, calculate expected video token count based on video_grid_thw
245244
expected_video_token_count = 0
246245
for thw in out_dict["video_grid_thw"]:
247246
expected_video_token_count += thw[0] * thw[1] * thw[2]

0 commit comments

Comments (0)