@@ -35,6 +35,8 @@
 REQUIRES_V0_MODELS = [
     # V1 Test: not enough KV cache space in CI.
     "fuyu",
+    # V1 Test: deadlock issue when processing mm_inputs
+    "llava-onevision-transformers",
 ]

 # yapf: disable
@@ -170,6 +172,79 @@
         hf_output_post_proc=model_utils.ultravox_trunc_hf_output,
         marks=[pytest.mark.core_model, pytest.mark.cpu_model],
     ),
+    #### Transformers fallback models to test
+    ## To reduce the test burden, we only test batching with arbitrary image sizes.
+    # Dynamic image length and number of patches
+    "llava-onevision-transformers": VLMTestInfo(
+        models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
+        test_type=VLMTestType.IMAGE,
+        prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
+        max_model_len=16384,
+        hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"),  # noqa: E501
+        auto_cls=AutoModelForImageTextToText,
+        vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
+        image_size_factors=[(0.25, 0.5, 1.0)],
+        vllm_runner_kwargs={
+            "model_impl": "transformers",
+            "disable_mm_preprocessor_cache": True,
+            "enable_prefix_caching": False,
+        },
+        marks=[pytest.mark.core_model],
+    ),
+    # FIXME(Isotr0py): Enable this test after
+    # https://github.com/huggingface/transformers/pull/39470 is released
+    # "idefics3-transformers": VLMTestInfo(
+    #     models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
+    #     test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+    #     prompt_formatter=lambda img_prompt: f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
+    #     img_idx_to_prompt=lambda idx: "<image>",
+    #     max_model_len=8192,
+    #     max_num_seqs=2,
+    #     auto_cls=AutoModelForImageTextToText,
+    #     hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
+    #     image_size_factors=[(0.25, 0.5, 1.0)],
+    #     vllm_runner_kwargs={
+    #         "model_impl": "transformers",
+    #         "disable_mm_preprocessor_cache": True,
+    #         "enable_prefix_caching": False,
+    #     },
+    #     marks=[pytest.mark.core_model],
+    # ),
+    # Pixel values from the processor are not 4D or 5D arrays
+    "qwen2_5_vl-transformers": VLMTestInfo(
+        models=["Qwen/Qwen2.5-VL-3B-Instruct"],
+        test_type=VLMTestType.IMAGE,
+        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
+        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",  # noqa: E501
+        max_model_len=4096,
+        max_num_seqs=2,
+        auto_cls=AutoModelForImageTextToText,
+        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
+        image_size_factors=[(0.25, 0.2, 0.15)],
+        vllm_runner_kwargs={
+            "model_impl": "transformers",
+            "disable_mm_preprocessor_cache": True,
+            "enable_prefix_caching": False,
+        },
+        marks=[large_gpu_mark(min_gb=32)],
+    ),
+    # Check that model_impl="auto" falls back to the Transformers implementation
+    "internvl-transformers": VLMTestInfo(
+        models=["OpenGVLab/InternVL3-1B-hf"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
+        img_idx_to_prompt=lambda idx: "<IMG_CONTEXT>",
+        max_model_len=4096,
+        use_tokenizer_eos=True,
+        image_size_factors=[(0.25, 0.5, 1.0)],
+        vllm_runner_kwargs={
+            "model_impl": "auto",
+            "disable_mm_preprocessor_cache": True,
+            "enable_prefix_caching": False,
+        },
+        auto_cls=AutoModelForImageTextToText,
+        marks=[pytest.mark.core_model],
+    ),
     #### Extended model tests
     "aria": VLMTestInfo(
         models=["rhymes-ai/Aria"],
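For context on what the new entries exercise: the `vllm_runner_kwargs` in each entry are forwarded to the vLLM engine, which is how these tests force (or, for `internvl-transformers`, merely allow) the Transformers fallback. The snippet below is a minimal sketch of what the `llava-onevision-transformers` configuration corresponds to when driving vLLM directly; it is not part of this diff, the prompt text and dummy image are illustrative assumptions, and it assumes `model_impl`, `disable_mm_preprocessor_cache`, and `enable_prefix_caching` are accepted as engine arguments in your vLLM version.

# Sketch only (not part of the diff): run the llava-onevision-transformers
# case directly through vLLM, assuming the engine accepts the same kwargs
# that vllm_runner_kwargs forwards in the test harness.
from PIL import Image
from vllm import LLM, SamplingParams

llm = LLM(
    model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
    max_model_len=16384,
    model_impl="transformers",            # force the Transformers fallback
    disable_mm_preprocessor_cache=True,   # mirror the test configuration
    enable_prefix_caching=False,
)

# Illustrative prompt using the same chat format as prompt_formatter above;
# the blank dummy image stands in for the harness's test images.
prompt = ("<|im_start|>user\n<image>\nDescribe the image briefly.<|im_end|>\n"
          "<|im_start|>assistant\n")
image = Image.new("RGB", (336, 336), color="white")

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)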