update documentation & make fixup

akibjawad · akibjawad · commit cf3df35682ea · 2025-07-31T17:01:54.000-04:00
diff --git a/docs/source/en/chat_templating_multimodal.md b/docs/source/en/chat_templating_multimodal.md
@@ -111,6 +111,7 @@ Some vision models also support video inputs. The message format is very similar
 
 - The content `"type"` should be `"video"` to indicate the content is a video.
 - For videos, it can be a link to the video (`"url"`) or it could be a file path (`"path"`). Videos loaded from a URL can only be decoded with [PyAV](https://pyav.basswood-io.com/docs/stable/) or [Decord](https://github.com/dmlc/decord).
+- In addition to loading videos from a URL or file path, you can also pass decoded video data directly. This is useful if you’ve already preprocessed or decoded video frames elsewhere in memory (e.g., using OpenCV, Decord, or torchvision). You don't need to save the frames to a file or host them at a URL.
 
 > [!WARNING]
 > Loading a video from `"url"` is only supported by the PyAV or Decord backends.
@@ -137,27 +138,11 @@ messages = [
 ]
 ```
 
-### Passing decoded video objects
-In addition to loading videos from a URL or file path, you can also pass decoded video data directly. 
-
-This is useful if you’ve already preprocessed or decoded video frames elsewhere in memory (e.g., using OpenCV, decord, or torchvision). You don't need to save to files or store it in an URL.
-
-- Use the `"video"` type with a dictionary that includes:
-    - `"frames"` (`np.ndarray` or `torch.Tensor`):
-        A 4D array of shape (num_frames, channels, height, width) containing decoded video frames.
-    - `"metadata"` (`"VideoMetadata"` or `"dict"`):
-        Describes metadata for the video. If you provide a dictionary, it must include at least one of:
-        - `"fps"` (frames per second)
-        - `"duration"` (video duration in seconds)
-        if both `"fps"` and `"duration"` is provided, `"fps"` gets priority and `"duration"` is calculated based on `"fps"`
-
+### Example: Passing decoded video objects
 ```python
 import numpy as np
 
-video_object1 = {
-    "frames": np.random.randint(0, 255, size=(16, 3, 224, 224), dtype=np.uint8),
-    "metadata": {"fps": 16, "duration": 2.0}
-}
+video_object1 = np.random.randint(0, 255, size=(16, 224, 224, 3), dtype=np.uint8)
 
 messages = [
     {
@@ -180,15 +165,10 @@ You can also use existing (`"load_video()"`) function to load a video, edit the
 from transformers.video_utils import load_video
 
 # load a video file in memory for testing
-frames, metadata = load_video(
+video_object2, _ = load_video(
     "https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4"
 )
 
-video_object2 = {
-    "frames": frames,
-    "metadata": metadata,
-}
-
 messages = [
     {
         "role": "system",
diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py
@@ -38,14 +38,12 @@
 from .feature_extraction_utils import BatchFeature
 from .image_utils import ChannelDimension, is_vision_available, load_image
 from .utils.chat_template_utils import render_jinja_template
-from .video_utils import VideoMetadata, convert_pil_frames_to_video, load_video
+from .video_utils import VideoMetadata, load_video
 
 
 if is_vision_available():
     from .image_utils import PILImageResampling
 
-if is_torch_available():
-    import torch
 
 from .tokenization_utils_base import (
     PaddingStrategy,
@@ -68,7 +66,6 @@
     download_url,
     is_offline_mode,
     is_remote_url,
-    is_torch_available,
     list_repo_templates,
     logging,
 )
@@ -1578,11 +1575,6 @@ def apply_chat_template(
                                 fname,
                                 backend=mm_load_kwargs["video_load_backend"],
                             )
-                            if metadata is None:
-                                logger.warning(
-                                    "When loading the video from list of decoded frames, we cannot infer metadata such as `fps` or `duration`. "
-                                    "If your model requires metadata during processing, please load the whole video and let the processor sample frames instead."
-                                )
                         videos.append(video)
                         video_metadata.append(metadata)
 
diff --git a/src/transformers/video_utils.py b/src/transformers/video_utils.py
@@ -563,14 +563,18 @@ def sample_indices_fn_func(metadata, **fn_kwargs):
 
         sample_indices_fn = sample_indices_fn_func
 
-    if isinstance(video, Union[np.ndarray, torch.Tensor]):
+    if is_valid_image(video) or (isinstance(video, (list, tuple)) and is_valid_image(video[0])):
         if not is_valid_video(video):
             raise ValueError(
                 f"When passing video as decoded frames, video should be a 4D numpy array or torch tensor, but got {video.ndim} dimensions instead."
             )
         # Case 1: Video is provided as a 4D numpy array or torch tensor (frames, height, width, channels)
         if is_torch_tensor(video):
             video = video.numpy()  # Convert torch tensor to numpy array
+        logger.warning(
+            "When loading the video from list of decoded frames, we cannot infer metadata such as `fps` or `duration`. "
+            "If your model requires metadata during processing, please load the whole video and let the processor sample frames instead."
+        )
         return video, None
 
     if urlparse(video).netloc in ["www.youtube.com", "youtube.com"]:
diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py
@@ -63,9 +63,7 @@
     from transformers.video_utils import load_video
 
     # load a video file in memory for testing
-    video, _ = load_video(
-        "https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4"
-    )
+    video, _ = load_video("https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4")
     MODALITY_INPUT_DATA["videos"].append(video)