From 271086d8a0c6b094910e5e63467ae3cad2c1c68f Mon Sep 17 00:00:00 2001
From: Christopher Manteuffel
Date: Fri, 22 Aug 2025 11:52:52 -0700
Subject: [PATCH 01/13] Initial import of ovis_2_5 work

---
 .../vision_language_multi_image.py            |  29 +
 .../generation/vlm_utils/model_utils.py       |  56 ++
 vllm/entrypoints/chat_utils.py                |   2 +-
 vllm/model_executor/models/ovis2_5.py         | 567 ++++++++++++++++
 vllm/model_executor/models/registry.py        |   1 +
 vllm/model_executor/models/siglip2navit.py    | 606 ++++++++++++++++++
 vllm/transformers_utils/config.py             |  14 +
 .../transformers_utils/processors/__init__.py |   3 +-
 vllm/transformers_utils/processors/ovis2_5.py | 458 +++++++++++++
 9 files changed, 1734 insertions(+), 2 deletions(-)
 create mode 100644 vllm/model_executor/models/ovis2_5.py
 create mode 100644 vllm/model_executor/models/siglip2navit.py
 create mode 100644 vllm/transformers_utils/processors/ovis2_5.py

diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index ea7a793d026b..bd71b021320d 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -459,6 +459,35 @@ def load_ovis(question: str, image_urls: list[str]) -> ModelRequestData:
         image_data=[fetch_image(url) for url in image_urls],
     )
 
+# ovis2_5
+def load_ovis2_5(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "AIDC-AI/Ovis2.5-2B"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=8192,
+        max_num_seqs=2,
+        trust_remote_code=True,
+        dtype="half",
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = "\n".join(
+        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
+    )
+    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    prompt = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
 
 def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
     model_name = "mistral-community/pixtral-12b"
diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py
index af4c72f44b67..ce62e95bdf97 100644
--- a/tests/models/multimodal/generation/vlm_utils/model_utils.py
+++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py
@@ -787,6 +787,62 @@ def processor(*args, text="", images=None, **kwargs):
     hf_model.processor = processor
     return hf_model
 
+def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    """Patches and returns an instance of the HfRunner to use for Ovis2.5."""
+    hf_model.model.get_output_embeddings = lambda: \
+        hf_model.model.llm.get_output_embeddings()
+
+    def processor(*args, text="", images=None, videos=None, **kwargs):
+        if images is None:
+            images = []
+        else:
+            images = [images] if isinstance(images, Image) else images
+        if videos is None:
+            videos = []
+        else:
+            videos = [videos] if isinstance(videos, np.ndarray) else videos
+            videos = [[PIL.Image.fromarray(frame) for frame in vid]
+                      for vid in videos]
+
+        prompt_start_and_end = {
+            "qwen2": ("<|im_start|>user\n", "<|im_end|>\n"),
+            "llama":
+            ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
+            "gemma2": ("<start_of_turn>user\n", "<end_of_turn>\n"),
+        }
+        for start, end in prompt_start_and_end.values():
+            if start in text and end in text:
+                text = text.split(start)[1].split(end)[0]
+                break
+
+        images_message = [{"type": "image", "image": img} for img in images]
+        videos_message = [{"type": "video", "video": vid} for vid in videos]
+
+        messages = [{
+            "role":
+            "user",
+            "content": [
+                *images_message,
+                *videos_message,
+                {
+                    "type": "text",
+                    "text": text
+                },
+            ],
+        }]
+
+        input_ids, pixel_values, grid_thws = hf_model.model.preprocess_inputs(
+            messages=messages, enable_thinking=True)
+        inputs = {
+            "inputs": input_ids,
+            "pixel_values": pixel_values,
+            "grid_thws": grid_thws,
+        }
+        return BatchFeature(data=inputs, tensor_type="pt")
+
+    hf_model.processor = processor
+    return hf_model
+
 
 def qwen2_5_omni_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
     """Patches and returns an instance of the HfRunner for Qwen2.5-Omni."""
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index f5f45a62ca2f..83d66c0d662f 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -527,7 +527,7 @@ def _placeholder_str(self, modality: ModalityStr,
 
         if model_type in ("aya_vision", "chameleon", "deepseek_vl_v2",
                           "internvl_chat", "ovis", "skywork_chat",
-                          "NVLM_D", "h2ovl_chat", "idefics3", "smolvlm"):
+                          "NVLM_D", "h2ovl_chat", "idefics3", "smolvlm", "ovis2_5"):
             return "<image>"
         if model_type in ("mllama", "llama4"):
             return "<|image|>"
diff --git a/vllm/model_executor/models/ovis2_5.py b/vllm/model_executor/models/ovis2_5.py
new file mode 100644
index 000000000000..b6fbd9ab82d2
--- /dev/null
+++ b/vllm/model_executor/models/ovis2_5.py
@@ -0,0 +1,567 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+""" PyTorch Ovis2.5 model."""
+from collections.abc import Iterable, Mapping
+from functools import partial
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+from transformers import BaseImageProcessor, BatchFeature, PretrainedConfig
+
+from vllm.config import VllmConfig
+from vllm.model_executor.layers.linear import ReplicatedLinear
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig)
+from vllm.model_executor.models.ovis import (OvisImagePatchInputs,
+                                             VisualEmbedding)
+from vllm.model_executor.models.siglip2navit import Siglip2NavitModel
+from vllm.model_executor.models.utils import (AutoWeightsLoader, flatten_bn,
+                                              init_vllm_registered_model,
+                                              maybe_prefix)
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalKwargs)
+from vllm.multimodal.parse import ImageSize, MultiModalDataItems
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        BaseProcessingInfo, PromptReplacement)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.processors.ovis2_5 import Ovis2_5Processor
+
+from .interfaces import MultiModalEmbeddings, SupportsMultiModal
+
+IMAGE_TOKEN = "<image>"
+VIDEO_TOKEN = "