Skip to content

Commit db06f5b

Browse files
committed
Address review comments
1 parent 8719706 commit db06f5b

File tree

4 files changed

+4
-13
lines changed

4 files changed

+4
-13
lines changed

src/transformers/video_utils.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -564,17 +564,11 @@ def sample_indices_fn_func(metadata, **fn_kwargs):
564564
sample_indices_fn = sample_indices_fn_func
565565

566566
if is_valid_image(video) or (isinstance(video, (list, tuple)) and is_valid_image(video[0])):
567+
# Case 1: Video is provided as a 4D numpy array or torch tensor (frames, height, width, channels)
567568
if not is_valid_video(video):
568569
raise ValueError(
569570
f"When passing video as decoded frames, video should be a 4D numpy array or torch tensor, but got {video.ndim} dimensions instead."
570571
)
571-
# Case 1: Video is provided as a 4D numpy array or torch tensor (frames, height, width, channels)
572-
if is_torch_tensor(video):
573-
video = video.numpy() # Convert torch tensor to numpy array
574-
logger.warning(
575-
"When loading the video from list of decoded frames, we cannot infer metadata such as `fps` or `duration`. "
576-
"If your model requires metadata during processing, please load the whole video and let the processor sample frames instead."
577-
)
578572
return video, None
579573

580574
if urlparse(video).netloc in ["www.youtube.com", "youtube.com"]:

tests/models/qwen2_5_omni/test_processor_qwen2_5_omni.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -423,8 +423,7 @@ def _test_apply_chat_template(
423423
self.assertEqual(len(out_dict["attention_mask"]), batch_size)
424424

425425
if modality == "video":
426-
# qwen pixels don't scale with bs same way as other models, calulate expected video token count based on video_grid_thw
427-
# TODO: update expected video token count calculation based on the internal processing logic of Qwen2_5OmniProcessor
426+
# qwen pixels don't scale with bs same way as other models, calculate expected video token count based on video_grid_thw
428427
expected_video_token_count = 0
429428
for thw in out_dict["video_grid_thw"]:
430429
expected_video_token_count += thw[0] * thw[1] * thw[2]

tests/models/qwen2_5_vl/test_processor_qwen2_5_vl.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -240,8 +240,7 @@ def _test_apply_chat_template(
240240
self.assertEqual(len(out_dict["attention_mask"]), batch_size)
241241

242242
if modality == "video":
243-
# qwen pixels don't scale with bs same way as other models, calulate expected video token count based on video_grid_thw
244-
# TODO: update expected video token count calculation based on the internal processing logic of Qwen2_5VLProcessor
243+
# qwen pixels don't scale with bs same way as other models, calculate expected video token count based on video_grid_thw
245244
expected_video_token_count = 0
246245
for thw in out_dict["video_grid_thw"]:
247246
expected_video_token_count += thw[0] * thw[1] * thw[2]

tests/models/qwen2_vl/test_processor_qwen2_vl.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -240,8 +240,7 @@ def _test_apply_chat_template(
240240
self.assertEqual(len(out_dict["input_ids"]), batch_size)
241241
self.assertEqual(len(out_dict["attention_mask"]), batch_size)
242242
if modality == "video":
243-
# qwen pixels don't scale with bs same way as other models, calulate expected video token count based on video_grid_thw
244-
# TODO: update expected video token count calculation based on the internal processing logic of Qwen2VLProcessor
243+
# qwen pixels don't scale with bs same way as other models, calculate expected video token count based on video_grid_thw
245244
expected_video_token_count = 0
246245
for thw in out_dict["video_grid_thw"]:
247246
expected_video_token_count += thw[0] * thw[1] * thw[2]

0 commit comments

Comments (0)