 
 from fastapi import Request
 
+from vllm import envs
 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.logger import RequestLogger
                                               ScoreResponseData, UsageInfo)
 from vllm.entrypoints.openai.serving_engine import OpenAIServing
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
+# yapf conflicts with isort for this block
+# yapf: disable
 from vllm.entrypoints.score_utils import (ScoreContentPartParam,
                                           ScoreMultiModalParam,
                                           _cosine_similarity,
                                           _validate_score_input_lens,
+                                          compress_token_type_ids,
                                           get_score_prompt)
+# yapf: enable
 from vllm.entrypoints.utils import _validate_truncation_size
 from vllm.inputs.data import TokensPrompt
 from vllm.logger import init_logger
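Note on the new imports: `envs` is used further down to branch on `VLLM_USE_V1`, and `compress_token_type_ids` lets the token type information for a cross-encoder pair ride along in the pooling params without shipping the full per-token list. The real helper lives in `vllm.entrypoints.score_utils`; the sketch below is only an illustration of the idea, under the assumption that cross-encoder `token_type_ids` are a run of 0s (first segment) followed by 1s (second segment), so the whole list reduces to the index of the first 1.

```python
# Illustrative sketch only -- not the actual helper from
# vllm.entrypoints.score_utils. Assumes token_type_ids look like
# [0, 0, ..., 0, 1, 1, ..., 1], i.e. a single 0 -> 1 boundary.
def compress_token_type_ids_sketch(token_type_ids: list[int]) -> int:
    """Return the index where the second segment (type 1) starts."""
    for idx, token_type in enumerate(token_type_ids):
        if token_type == 1:
            return idx
    # No type-1 tokens: the boundary is the end of the sequence.
    return len(token_type_ids)


assert compress_token_type_ids_sketch([0, 0, 0, 1, 1]) == 3
assert compress_token_type_ids_sketch([0, 0]) == 2
```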
@@ -158,6 +163,8 @@ def _preprocess_score(
             tokenizer=tokenizer,
             tokenization_kwargs=tokenization_kwargs,
         )
+        self._validate_input(request, engine_prompt["prompt_token_ids"],
+                             full_prompt)
         if request.mm_processor_kwargs is not None:
             engine_prompt["mm_processor_kwargs"] = request.mm_processor_kwargs
 
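With the text-only branch removed in the next hunk, every pair (text or multimodal) now flows through `_preprocess_score`, so prompt-length validation has to happen here instead of in the old tokenizer branch; the added call hands the tokenized ids and the rendered prompt to `self._validate_input`. As a rough mental model only (this is not vLLM's actual `_validate_input`, which also handles truncation and other checks), the guard amounts to rejecting prompts that exceed the model's context window:

```python
# Simplified illustration of the kind of guard _validate_input provides.
# The names and behavior here are assumptions for illustration only.
def validate_prompt_length_sketch(prompt_token_ids: list[int],
                                  max_model_len: int) -> None:
    if len(prompt_token_ids) > max_model_len:
        raise ValueError(
            f"Prompt has {len(prompt_token_ids)} tokens, which exceeds "
            f"the model's maximum context length of {max_model_len}.")
```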
@@ -188,64 +195,27 @@ async def _cross_encoding_score(
 
         input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]
 
-        if self.model_config.is_multimodal_model:
+        preprocess_async = make_async(self._preprocess_score,
+                                      executor=self._tokenizer_executor)
 
-            preprocess_async = make_async(self._preprocess_score,
-                                          executor=self._tokenizer_executor)
+        preprocessed_prompts = await asyncio.gather(
+            *(preprocess_async(request=request,
+                               tokenizer=tokenizer,
+                               tokenization_kwargs=tokenization_kwargs,
+                               data_1=t1,
+                               data_2=t2) for t1, t2 in input_pairs))
 
-            preprocessed_prompts = await asyncio.gather(
-                *(preprocess_async(request=request,
-                                   tokenizer=tokenizer,
-                                   tokenization_kwargs=tokenization_kwargs,
-                                   data_1=t1,
-                                   data_2=t2) for t1, t2 in input_pairs))
-
-            for full_prompt, engine_prompt in preprocessed_prompts:
-                request_prompts.append(full_prompt)
-                engine_prompts.append(engine_prompt)
-
-        else:
-            tokenize_async = make_async(tokenizer.__call__,
-                                        executor=self._tokenizer_executor)
-            use_pad_token = self.model_config.use_pad_token
-
-            if use_pad_token:
-                # cross_encoder models defaults to using pad_token.
-                tokenized_prompts = await asyncio.gather(*(
-                    tokenize_async(
-                        text=t1,  # type: ignore[arg-type]
-                        text_pair=t2,  # type: ignore[arg-type]
-                        **tokenization_kwargs) for t1, t2 in input_pairs))
-            else:
-                # `llm as reranker` models defaults to not using pad_token.
-                tokenized_prompts = await asyncio.gather(*(
-                    tokenize_async(
-                        text=t1 +  # type: ignore[operator]
-                        t2,
-                        **tokenization_kwargs) for t1, t2 in input_pairs))
-
-            for prompt_inputs, (t1, t2) in zip(tokenized_prompts, input_pairs):
-                sep_token = tokenizer.sep_token if (tokenizer.sep_token
-                                                    and use_pad_token) else ''
-                request_prompt = f"{t1}{sep_token}{t2}"
-
-                input_ids = prompt_inputs["input_ids"]
-                text_token_prompt = \
-                    self._validate_input(request, input_ids, request_prompt)
-                engine_prompt = TokensPrompt(
-                    prompt_token_ids=text_token_prompt["prompt_token_ids"],
-                    token_type_ids=prompt_inputs.get("token_type_ids"))
-
-                request_prompts.append(request_prompt)
-                engine_prompts.append(engine_prompt)
+        for full_prompt, engine_prompt in preprocessed_prompts:
+            request_prompts.append(full_prompt)
+            engine_prompts.append(engine_prompt)
 
         # Schedule the request and get the result generator.
         generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
 
-        pooling_params = request.to_pooling_params()
+        default_pooling_params = request.to_pooling_params()
 
         try:
-            pooling_params.verify("score", self.model_config)
+            default_pooling_params.verify("score", self.model_config)
         except ValueError as e:
             return self.create_error_response(str(e))
 
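The net effect of this hunk: the separate multimodal and text-only paths are collapsed into one. Every pair is preprocessed by `_preprocess_score`, and the blocking tokenization work is pushed onto the tokenizer executor via `make_async` so all pairs can be handled concurrently with `asyncio.gather`. A minimal, self-contained sketch of that fan-out pattern (not vLLM's code, just the shape of it; `make_async_sketch` and `preprocess_pair` are stand-ins):

```python
import asyncio
from concurrent.futures import ThreadPoolExecutor
from functools import partial


def make_async_sketch(func, executor):
    """Wrap a blocking callable so it runs on `executor` when awaited."""
    async def _async_wrapper(*args, **kwargs):
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(executor,
                                          partial(func, *args, **kwargs))
    return _async_wrapper


def preprocess_pair(t1: str, t2: str) -> str:
    # Stand-in for the real tokenization / prompt-building work.
    return f"{t1} [SEP] {t2}"


async def main() -> None:
    with ThreadPoolExecutor(max_workers=4) as executor:
        preprocess_async = make_async_sketch(preprocess_pair, executor)
        pairs = [("a query", "a document"), ("another query", "another doc")]
        prompts = await asyncio.gather(
            *(preprocess_async(t1, t2) for t1, t2 in pairs))
        print(prompts)


asyncio.run(main())
```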
@@ -254,9 +224,19 @@ async def _cross_encoding_score(
 
             self._log_inputs(request_id_item,
                              request_prompts[i],
-                             params=pooling_params,
+                             params=default_pooling_params,
                              lora_request=lora_request)
 
+            if envs.VLLM_USE_V1 and (token_type_ids := engine_prompt.pop(
+                    "token_type_ids", None)):
+                pooling_params = default_pooling_params.clone()
+                compressed = compress_token_type_ids(token_type_ids)
+                pooling_params.extra_kwargs = {
+                    "compressed_token_type_ids": compressed
+                }
+            else:
+                pooling_params = (default_pooling_params)
+
             generator = self.engine_client.encode(
                 engine_prompt,
                 pooling_params,
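In the V1 path, `token_type_ids` no longer travel on the engine prompt itself: the shared `default_pooling_params` is cloned per prompt whenever token type ids are present, and the compressed boundary is attached through `pooling_params.extra_kwargs` so it can be recovered downstream. A hedged sketch of what a consumer might do with that value (the actual reconstruction happens in the model/pooling code; the function name and contract below are assumptions matching the compress sketch above):

```python
# Hypothetical consumer-side reconstruction, assuming the "compressed"
# value is the index where the token type switches from 0 to 1.
# Not vLLM's actual pooling/model code.
def expand_token_type_ids_sketch(num_tokens: int, boundary: int) -> list[int]:
    return [0] * boundary + [1] * (num_tokens - boundary)


assert expand_token_type_ids_sketch(5, 3) == [0, 0, 0, 1, 1]
```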