Skip to content

Commit 2aa4140

Browse files
hujiaxin0Emilie1001Isotr0py
authored
openpangu-vl support video input (vllm-project#34134)
Signed-off-by: hujiaxin <524446785@qq.com> Signed-off-by: Emilie1001 <79921183+Emilie1001@users.noreply.github.com> Co-authored-by: Emilie1001 <79921183+Emilie1001@users.noreply.github.com> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
1 parent 86c3b5a commit 2aa4140

File tree

1 file changed

+87
-0
lines changed

1 file changed

+87
-0
lines changed

vllm/multimodal/video.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -747,3 +747,90 @@ def load_bytes(
747747
**kwargs,
748748
)
749749
return out
750+
751+
752+
@VIDEO_LOADER_REGISTRY.register("openpangu")
class OpenCVDynamicOpenPanguVideoBackend(OpenCVVideoBackend):
    """OpenCV-based loader, registered as ``"openpangu"``, that samples by timestamp."""

    @classmethod
    def load_bytes(
        cls,
        data: bytes,
        num_frames: int = 32,
        fps: int = 1,
        max_duration: int = 300,
        frame_recovery: bool = False,
        **kwargs,
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """
        Load video frames, choosing them by evenly spaced timestamps.

        Timestamps are measured from frame 0. With ``total_num_frames = 10``
        and a source rate of 1 FPS, frame 0 is at 0.0 s, frame 1 at 1.0 s and
        the last frame (frame 9) at 9.0 s, i.e.
        ``(total_frames_num - 1) / original_fps``. That last-frame timestamp
        is what this method calls the duration.

        Args:
            data: Raw video bytes.
            num_frames: Upper bound on the number of frames to sample. When
                ``fps > 0`` and sampling the clip at ``fps`` needs no more
                than this many frames, the count is reduced to
                ``int(duration * fps) + 1`` and the sampled span is shortened
                to match, so timestamps are spaced ``1 / fps`` apart.
                Otherwise exactly ``num_frames`` timestamps are spread evenly
                over the full duration.
            fps: Target sampling rate. ``-1`` disables the FPS limit, in which
                case ``num_frames`` timestamps are spread over the full
                duration.
            max_duration: Accepted for signature compatibility; not read by
                this backend.
            frame_recovery: Accepted for signature compatibility; not read by
                this backend, which always calls
                ``_read_frames_with_recovery``.
            **kwargs: Ignored.

        Returns:
            Tuple of ``(frames, metadata)``. ``frames`` is whatever
            ``_read_frames_with_recovery`` returns as its first element, and
            ``metadata`` holds ``total_num_frames``, ``fps`` (the source
            FPS), ``duration`` (the sampled span in seconds),
            ``video_backend``, ``frames_indices`` and ``do_sample_frames``.

        Raises:
            ValueError: If the stream cannot be opened, or if ``fps`` is
                neither ``-1`` nor greater than 0.
        """
        import cv2

        backend = cls().get_cv2_video_api()
        cap = cv2.VideoCapture(BytesIO(data), backend, [])
        try:
            if not cap.isOpened():
                raise ValueError("Could not open video stream")

            total_frames_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            original_fps = float(cap.get(cv2.CAP_PROP_FPS))
            # Duration is the timestamp of the last frame, so it is computed
            # from (N - 1) frame intervals rather than N.
            if total_frames_num >= 1 and original_fps > 0:
                total_duration = (total_frames_num - 1) / original_fps
            else:
                total_duration = 0

            # `fps` is the requested sampling rate; -1 means "no FPS limit".
            if fps > 0:
                # `num_frames` is only a cap: if sampling at `fps` needs no
                # more frames than that, use the smaller count.
                frames_at_fps = int(total_duration * fps) + 1
                if num_frames >= frames_at_fps:
                    num_frames = frames_at_fps
                    # Shrink the span to the timestamp of the last sampled
                    # frame so consecutive samples are exactly 1 / fps apart.
                    total_duration = min(total_duration, (num_frames - 1) / fps)
            elif fps != -1:
                raise ValueError(
                    f"requires dataset fps is -1 or greater than 0 but got {fps}"
                )

            sample_frame_timestamps = np.linspace(
                0, total_duration, num_frames, dtype=float
            )
            # Clamp into [0, total_frames_num - 1]. The lower bound matters
            # when the container reports a frame count of 0 (or less), which
            # would otherwise produce a negative index.
            frames_indices = [
                max(0, min(total_frames_num - 1, round(t * original_fps)))
                for t in sample_frame_timestamps
            ]

            frames, valid_frame_indices, recovered_map = (
                cls._read_frames_with_recovery(cap, frames_indices, total_frames_num)
            )
        finally:
            # Always free the decoder, including when opening, validation or
            # frame reading fails.
            cap.release()

        if recovered_map:
            logger.info(
                "Frame recovery: %d frames recovered using forward scan.",
                len(recovered_map),
            )

        metadata = {
            "total_num_frames": total_frames_num,
            "fps": original_fps,
            "duration": total_duration,
            "video_backend": "opencv_dynamic_openpangu",
            "frames_indices": valid_frame_indices,
            "do_sample_frames": False,
        }
        return frames, metadata

0 commit comments

Comments
 (0)