from modelscope import snapshot_download  # type: ignore[import-untyped]
from vllm import SamplingParams
from vllm.assets.image import ImageAsset
+from vllm.assets.audio import AudioAsset

import vllm_ascend  # noqa: F401
from tests.e2e.conftest import VllmRunner

MODELS = [
    "Qwen/Qwen2.5-0.5B-Instruct",
    "Qwen/Qwen3-0.6B-Base",
]
-MULTIMODALITY_MODELS = ["Qwen/Qwen2.5-VL-3B-Instruct"]
+MULTIMODALITY_VL_MODELS = ["Qwen/Qwen2.5-VL-3B-Instruct"]
+MULTIMODALITY_AUDIO_MODELS = ["Qwen/Qwen2-Audio-7B-Instruct"]

QUANTIZATION_MODELS = [
    "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8",
]
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
+AUDIO_ASSETS = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
+AUDIO_PROMPT_TEMPLATES = {
+    1: "What is recited in the audio?",
+    2: "What sport and what nursery rhyme are referenced?"
+}


@pytest.mark.parametrize("model", MODELS)
@@ -84,8 +91,8 @@ def test_quantization_models(model: str, max_tokens: int) -> None:
        vllm_model.generate_greedy(example_prompts, max_tokens)


-@pytest.mark.parametrize("model", MULTIMODALITY_MODELS)
-def test_multimodal(model, prompt_template, vllm_runner):
+@pytest.mark.parametrize("model", MULTIMODALITY_VL_MODELS)
+def test_multimodal_vl(model, prompt_template, vllm_runner):
    image = ImageAsset("cherry_blossom") \
        .pil_image.convert("RGB")
    img_questions = [
@@ -108,6 +115,47 @@ def test_multimodal(model, prompt_template, vllm_runner):
                                   max_tokens=64)


+def prepare_audio_inputs(audio_count: int):
+    audio_prompt = "".join([
+        f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
+        for idx in range(audio_count)
+    ])
+    question = AUDIO_PROMPT_TEMPLATES[audio_count]
+    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+              "<|im_start|>user\n"
+              f"{audio_prompt}{question}<|im_end|>\n"
+              "<|im_start|>assistant\n")
+    mm_data = {
+        "audio":
+        [asset.audio_and_sample_rate for asset in AUDIO_ASSETS[:audio_count]]
+    }
+    inputs = {"prompt": prompt, "multi_modal_data": mm_data}
+    return inputs
+
+
+@pytest.mark.parametrize("model", MULTIMODALITY_AUDIO_MODELS)
+@pytest.mark.parametrize("audio_count", [2])
+@pytest.mark.parametrize("max_tokens", [10])
+def test_multimodal_audio(model: str, audio_count: int,
+                          max_tokens: int) -> None:
+    inputs = prepare_audio_inputs(audio_count)
+
+    llm_config = {
+        "max_model_len": 4096,
+        "max_num_seqs": 5,
+        "limit_mm_per_prompt": {
+            "audio": audio_count
+        },
+        "gpu_memory_utilization": 0.9
+    }
+
+    with VllmRunner(model, **llm_config) as vllm_model:
+        sampling_params = SamplingParams(temperature=0.2,
+                                         max_tokens=max_tokens,
+                                         stop_token_ids=None)
+        vllm_model.generate(inputs, sampling_params=sampling_params)
+
+
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION": "1"})
def test_models_topk() -> None:
    example_prompts = [
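For reference, outside the pytest harness the same audio path can be exercised with vLLM's offline LLM entry point. The sketch below mirrors the prompt and multi-modal data built by prepare_audio_inputs in the diff; the model name, prompt format, and sampling settings come from the diff itself, while driving it through LLM directly (rather than the VllmRunner fixture) is an assumption for illustration.

# Illustrative sketch only: replays the new test_multimodal_audio flow with
# vLLM's offline LLM API instead of the VllmRunner fixture (an assumption).
from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset

assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
question = "What sport and what nursery rhyme are referenced?"
audio_tags = "".join(f"Audio {i + 1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
                     for i in range(len(assets)))
prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
          "<|im_start|>user\n"
          f"{audio_tags}{question}<|im_end|>\n"
          "<|im_start|>assistant\n")

llm = LLM(model="Qwen/Qwen2-Audio-7B-Instruct",
          max_model_len=4096,
          max_num_seqs=5,
          limit_mm_per_prompt={"audio": len(assets)})

# Audio clips are passed as (waveform, sample_rate) tuples via multi_modal_data.
outputs = llm.generate(
    {
        "prompt": prompt,
        "multi_modal_data": {
            "audio": [a.audio_and_sample_rate for a in assets]
        },
    },
    sampling_params=SamplingParams(temperature=0.2, max_tokens=10))
print(outputs[0].outputs[0].text)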