Commit 1630cc8

[Benchmarks] Add video inputs to ShareGPTDataset. (#23199)
Signed-off-by: Chenheli Hua <[email protected]>
1 parent 14e2b07 commit 1630cc8

File tree

3 files changed: +113 -6 lines changed

benchmarks/README.md

Lines changed: 38 additions & 3 deletions
@@ -32,6 +32,14 @@ become available.
       <div>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:</div>
       <code>wget http://images.cocodataset.org/zips/train2017.zip</code>
     </td>
+  </tr>
+  <tr>
+    <td><strong>ShareGPT4Video (Video)</strong></td>
+    <td style="text-align: center;">✅</td>
+    <td style="text-align: center;">✅</td>
+    <td>
+      <code>git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video</code>
+    </td>
   </tr>
   <tr>
     <td><strong>BurstGPT</strong></td>
@@ -231,7 +239,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct
 ```bash
 vllm bench serve \
   --backend openai-chat \
-  --endpoint-type openai-chat \
+  --endpoint-type openai-chat \
   --model Qwen/Qwen2-VL-7B-Instruct \
   --endpoint /v1/chat/completions \
   --dataset-name hf \
@@ -246,7 +254,7 @@ vllm bench serve \
 ```bash
 vllm bench serve \
   --backend openai-chat \
-  --endpoint-type openai-chat \
+  --endpoint-type openai-chat \
   --model Qwen/Qwen2-VL-7B-Instruct \
   --endpoint /v1/chat/completions \
   --dataset-name hf \
@@ -612,7 +620,7 @@ vllm bench serve \
   --prefix-repetition-prefix-len 512 \
   --prefix-repetition-suffix-len 128 \
   --prefix-repetition-num-prefixes 5 \
-  --prefix-repetition-output-len 128
+  --prefix-repetition-output-len 128
 ```

 </details>
@@ -687,4 +695,31 @@ python benchmarks/benchmark_serving.py \
   --endpoint /v1/chat/completion
 ```

+### Videos (ShareGPT4Video)
+
+Start vLLM:
+
+```bash
+python -m vllm.entrypoints.openai.api_server \
+  --model Qwen/Qwen2.5-VL-7B-Instruct \
+  --dtype bfloat16 \
+  --limit-mm-per-prompt '{"video": 1}' \
+  --allowed-local-media-path /path/to/sharegpt4video/videos
+```
+
+Send requests with videos:
+
+```bash
+python benchmarks/benchmark_serving.py \
+  --backend openai-chat \
+  --model Qwen/Qwen2.5-VL-7B-Instruct \
+  --dataset-name sharegpt \
+  --dataset-path /path/to/ShareGPT4Video/llava_v1_5_mix665k_with_video_chatgpt72k_share4video28k.json \
+  --num-prompts 100 \
+  --save-result \
+  --result-dir ~/vllm_benchmark_results \
+  --save-detailed \
+  --endpoint /v1/chat/completion
+```
+
 </details>
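For reference, the video requests generated by this benchmark path reduce to an ordinary OpenAI-compatible chat completion whose user message carries a `video_url` content part. The sketch below shows that request shape; it assumes a server started as in the README above listening on localhost:8000, and the clip path is a placeholder under the `--allowed-local-media-path` directory, not something taken from the commit.

```python
# Minimal sketch of the request shape used for a video entry (placeholder
# host and file path; not part of the commit itself).
import requests

payload = {
    "model": "Qwen/Qwen2.5-VL-7B-Instruct",
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe what happens in this clip."},
                {
                    "type": "video_url",
                    # Local files must live under --allowed-local-media-path.
                    "video_url": {"url": "file:///path/to/sharegpt4video/videos/example.mp4"},
                },
            ],
        }
    ],
    "max_tokens": 128,
}

resp = requests.post("http://localhost:8000/v1/chat/completions", json=payload, timeout=600)
print(resp.json()["choices"][0]["message"]["content"])
```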

benchmarks/benchmark_dataset.py

Lines changed: 37 additions & 1 deletion
@@ -293,6 +293,41 @@ def process_image(image: Any) -> Mapping[str, Any]:
     )


+def process_video(video: Any) -> Mapping[str, Any]:
+    """
+    Process a single video input and return a multimedia content dictionary.
+
+    Supports the following input types:
+
+    1. Dictionary with raw video bytes: - Expects a dict with a 'bytes' key
+       containing raw video data.
+
+    2. String input: - Treats the string as a URL or local file path. -
+       Prepends "file://" if the string doesn't start with "http://" or
+       "file://". - Returns a dictionary with the video URL.
+
+    Raises:
+        ValueError: If the input is not a supported type.
+    """
+    if isinstance(video, dict) and "bytes" in video:
+        video_bytes = video["bytes"]
+        video_base64 = base64.b64encode(video_bytes).decode("utf-8")
+        return {
+            "type": "video_url",
+            "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
+        }
+
+    if isinstance(video, str):
+        video_url = (
+            video if video.startswith(("http://", "file://")) else f"file://{video}"
+        )
+        return {"type": "video_url", "video_url": {"url": video_url}}
+
+    raise ValueError(
+        f"Invalid video input {video}. Must be a string of local path/remote url, or a dictionary with raw video bytes in the form of `{{'bytes': raw_video_bytes}}`."  # noqa: E501
+    )
+
+
 # -----------------------------------------------------------------------------
 # Random Dataset Implementation (Synthetic Data)
 # -----------------------------------------------------------------------------
@@ -451,9 +486,10 @@ def sample(
                 skip_min_output_len_check=output_len is not None,
             ):
                 continue
-            # TODO: Also support ShareGPT4Video.
             if image_path := entry.get("image"):
                 mm_content = process_image(image_path)
+            elif video_path := entry.get("video"):
+                mm_content = process_video(video_path)
             else:
                 mm_content = None
             if enable_multimodal_chat:
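To illustrate the new helper's contract, here is a short sketch of how `process_video` maps each supported input to an OpenAI-style `video_url` content part. The file paths are placeholders, and the import assumes the snippet is run from the benchmarks/ directory so `benchmark_dataset` is importable.

```python
# Hypothetical usage of process_video (paths are placeholders); run from the
# benchmarks/ directory so benchmark_dataset is on the import path.
from benchmark_dataset import process_video

# Local path: gets a "file://" prefix.
print(process_video("/data/sharegpt4video/videos/clip_0001.mp4"))
# {'type': 'video_url', 'video_url': {'url': 'file:///data/sharegpt4video/videos/clip_0001.mp4'}}

# Remote URL starting with "http://": passed through unchanged.
print(process_video("http://example.com/clip.mp4"))

# Raw bytes: embedded as a base64 data URL (assumed to be MP4).
with open("/data/sharegpt4video/videos/clip_0001.mp4", "rb") as f:
    content = process_video({"bytes": f.read()})
print(content["video_url"]["url"][:30])  # data:video/mp4;base64,...
```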

vllm/benchmarks/datasets.py

Lines changed: 38 additions & 2 deletions
@@ -281,7 +281,7 @@ def process_image(image: Any) -> Mapping[str, Any]:
     """
     Process a single image input and return a multimedia content dictionary.

-    Supports three input types:
+    Supports the following input types:

     1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
        containing raw image data. - Loads the bytes as a PIL.Image.Image.
@@ -321,6 +321,41 @@ def process_image(image: Any) -> Mapping[str, Any]:
                      " or str or dictionary with raw image bytes.")


+def process_video(video: Any) -> Mapping[str, Any]:
+    """
+    Process a single video input and return a multimedia content dictionary.
+
+    Supports the following input types:
+
+    1. Dictionary with raw video bytes: - Expects a dict with a 'bytes' key
+       containing raw video data.
+
+    2. String input: - Treats the string as a URL or local file path. -
+       Prepends "file://" if the string doesn't start with "http://" or
+       "file://". - Returns a dictionary with the video URL.
+
+    Raises:
+        ValueError: If the input is not a supported type.
+    """
+    if isinstance(video, dict) and 'bytes' in video:
+        video_bytes = video['bytes']
+        video_base64 = base64.b64encode(video_bytes).decode("utf-8")
+        return {
+            "type": "video_url",
+            "video_url": {
+                "url": f"data:video/mp4;base64,{video_base64}"
+            },
+        }
+
+    if isinstance(video, str):
+        video_url = (video if video.startswith(
+            ("http://", "file://")) else f"file://{video}")
+        return {"type": "video_url", "video_url": {"url": video_url}}
+
+    raise ValueError(
+        f"Invalid video input {video}. Must be a string of local path/remote url, or a dictionary with raw video bytes in the form of `{{'bytes': raw_video_bytes}}`."  # noqa: E501
+    )
+
 # -----------------------------------------------------------------------------
 # Random Dataset Implementation (Synthetic Data)
 # -----------------------------------------------------------------------------
@@ -474,9 +509,10 @@ def sample(
                                      skip_min_output_len_check=output_len
                                      is not None):
                 continue
-            # TODO: Also support ShareGPT4Video.
             if image_path := entry.get("image"):
                 mm_content = process_image(image_path)
+            elif video_path := entry.get("video"):
+                mm_content = process_video(video_path)
             else:
                 mm_content = None
             if enable_multimodal_chat:
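For context, the updated `sample()` now also picks up a `"video"` field from ShareGPT-style records via `entry.get("video")` and converts it with `process_video`. Below is an illustrative record shape; the id, path, and conversation text are placeholders, not values taken from the ShareGPT4Video dataset.

```python
# Illustrative ShareGPT-style entry with a video reference (values are
# placeholders). sample() reads the conversations for the prompt text and
# entry.get("video") for the clip, which process_video turns into a
# video_url content part.
entry = {
    "id": "share4video_example",
    "video": "/path/to/sharegpt4video/videos/example_clip.mp4",
    "conversations": [
        {"from": "human", "value": "<video>\nWhat is happening in this video?"},
        {"from": "gpt", "value": "A person assembles a small wooden chair."},
    ],
}
```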
