Merged
Changes from 4 commits
54 changes: 38 additions & 16 deletions docs/models/pooling_models.md
@@ -83,21 +83,6 @@ which takes priority over both the model's and Sentence Transformers's defaults.
The [LLM][vllm.LLM] class provides various methods for offline inference.
See [configuration][configuration] for a list of options when initializing the model.

### `LLM.encode`

The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM.
It returns the extracted hidden states directly, which is useful for reward models.

```python
from vllm import LLM

llm = LLM(model="Qwen/Qwen2.5-Math-RM-72B", runner="pooling")
(output,) = llm.encode("Hello, my name is")

data = output.outputs.data
print(f"Data: {data!r}")
```

### `LLM.embed`

The [embed][vllm.LLM.embed] method outputs an embedding vector for each prompt.
@@ -106,7 +91,7 @@ It is primarily designed for embedding models.
```python
from vllm import LLM

llm = LLM(model="intfloat/e5-mistral-7b-instruct", runner="pooling")
llm = LLM(model="intfloat/e5-small", runner="pooling")
(output,) = llm.embed("Hello, my name is")

embeds = output.outputs.embedding
@@ -154,6 +139,43 @@ print(f"Score: {score}")

A code example can be found here: <gh-file:examples/offline_inference/basic/score.py>

### `LLM.reward`

The [reward][vllm.LLM.reward] method is available to all pooling models in vLLM.
It returns the extracted hidden states directly, which is useful for reward models.

```python
from vllm import LLM

llm = LLM(model="internlm/internlm2-1_8b-reward", runner="pooling", trust_remote_code=True)
(output,) = llm.reward("Hello, my name is")

data = output.outputs.data
print(f"Data: {data!r}")
```
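
Like the other pooling entry points, `reward` also accepts a sequence of prompts for batch inference and returns one output per prompt. A minimal sketch reusing the model loaded above; the example prompts are only illustrative:

```python
# One PoolingRequestOutput is returned per input prompt.
outputs = llm.reward(["The answer is 42.", "The answer is 43."])
for output in outputs:
    print(f"Data: {output.outputs.data!r}")
```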

### `LLM.encode`

The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM.
It returns the extracted hidden states directly.

!!! note
`LLM.encode` defaults to using `pooling_task = embed`.
- For embeddings, use `LLM.embed(...)`.
- For classification logits, use `LLM.classify(...)`.
- For reward scores, use `LLM.reward(...)`.
- For similarity scores, use `LLM.score(...)`.

```python
from vllm import LLM

llm = LLM(model="intfloat/e5-small", runner="pooling")
(output,) = llm.encode("Hello, my name is")

data = output.outputs.data
print(f"Data: {data!r}")
```
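
With the updated signature in this PR, `encode` also accepts an explicit `pooling_task`; when a task is passed, the default described in the note above is not used. A minimal sketch, assuming the same model as above:

```python
# Request the "embed" pooling task explicitly instead of relying on the default.
(output,) = llm.encode("Hello, my name is", pooling_task="embed")

data = output.outputs.data
print(f"Data: {data!r}")
```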

## Online Serving

Our [OpenAI-Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs:
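
For example, an embedding model served with `vllm serve intfloat/e5-small` can be queried through the OpenAI-compatible Embeddings API. A minimal sketch, assuming a server running locally on the default port:

```python
from openai import OpenAI

# Point the official OpenAI client at the local vLLM server.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.embeddings.create(
    model="intfloat/e5-small",
    input="Hello, my name is",
)
print(response.data[0].embedding[:8])
```
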
4 changes: 2 additions & 2 deletions tests/conftest.py
@@ -1049,8 +1049,8 @@ def embed(self,
req_outputs = self.llm.embed(inputs, *args, **kwargs)
return [req_output.outputs.embedding for req_output in req_outputs]

def encode(self, prompts: list[str]) -> list[list[float]]:
req_outputs = self.llm.encode(prompts)
def reward(self, prompts: list[str]) -> list[list[float]]:
req_outputs = self.llm.reward(prompts)
return [req_output.outputs.data for req_output in req_outputs]

def score(
2 changes: 1 addition & 1 deletion tests/models/language/pooling/test_reward.py
@@ -95,7 +95,7 @@ def test_prm_models(
monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")

with vllm_runner(model, max_model_len=1024, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.encode(math_step_prompts)
vllm_outputs = vllm_model.reward(math_step_prompts)

with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
hf_model = step_reward_patch_hf_model(hf_model)
6 changes: 3 additions & 3 deletions tests/models/language/pooling/test_truncation_control.py
@@ -28,7 +28,7 @@ def test_smaller_truncation_size(vllm_runner,

with vllm_runner(model_name, runner="pooling",
max_model_len=max_model_len) as vllm_model:
vllm_output = vllm_model.llm.encode(
vllm_output = vllm_model.llm.embed(
input_str, truncate_prompt_tokens=truncate_prompt_tokens)

prompt_tokens = vllm_output[0].prompt_token_ids
@@ -43,7 +43,7 @@ def test_max_truncation_size(vllm_runner,

with vllm_runner(model_name, runner="pooling",
max_model_len=max_model_len) as vllm_model:
vllm_output = vllm_model.llm.encode(
vllm_output = vllm_model.llm.embed(
input_str, truncate_prompt_tokens=truncate_prompt_tokens)

prompt_tokens = vllm_output[0].prompt_token_ids
@@ -61,7 +61,7 @@ def test_bigger_truncation_size(vllm_runner,
model_name, runner="pooling",
max_model_len=max_model_len) as vllm_model:

llm_output = vllm_model.llm.encode(
llm_output = vllm_model.llm.embed(
input_str, truncate_prompt_tokens=truncate_prompt_tokens)

assert llm_output == f"""truncate_prompt_tokens value
51 changes: 50 additions & 1 deletion vllm/entrypoints/llm.py
@@ -1060,7 +1060,7 @@
truncate_prompt_tokens: Optional[int] = None,
use_tqdm: Union[bool, Callable[..., tqdm]] = True,
lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
pooling_task: PoolingTask = "encode",
pooling_task: Optional[PoolingTask] = None,
tokenization_kwargs: Optional[dict[str, Any]] = None,
) -> list[PoolingRequestOutput]:
"""Apply pooling to the hidden states corresponding to the input
@@ -1092,6 +1092,16 @@
considered legacy and may be deprecated in the future. You should
instead pass them via the `inputs` parameter.
"""
if pooling_task is None:
logger.warning(
"`LLM.encode` defaults to using `pooling_task = embed`.\n"
"Please use one of the more specific methods instead of `encode`:\n"

Check failure on line 1098 in vllm/entrypoints/llm.py

View workflow job for this annotation

GitHub Actions / pre-commit

Ruff (E501)

vllm/entrypoints/llm.py:1098:81: E501 Line too long (84 > 80)
" - For embeddings, use `LLM.embed(...)`.\n"
" - For classification logits, use `LLM.classify(...)`.\n"
" - For reward scores, use `LLM.reward(...)`.\n"
" - For similarity scores, use `LLM.score(...)`.")
pooling_task = "embed"

model_config = self.llm_engine.model_config
runner_type = model_config.runner_type
if runner_type != "pooling":
@@ -1230,6 +1240,45 @@

return [ClassificationRequestOutput.from_base(item) for item in items]

def reward(
self,
prompts: Union[PromptType, Sequence[PromptType]],
/,
*,
truncate_prompt_tokens: Optional[int] = None,
use_tqdm: Union[bool, Callable[..., tqdm]] = True,
pooling_params: Optional[Union[PoolingParams,
Sequence[PoolingParams]]] = None,
lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
) -> list[PoolingRequestOutput]:
"""
Generate reward scores for each prompt.

Args:
prompts: The prompts to the LLM. You may pass a sequence of prompts
for batch inference. See [PromptType][vllm.inputs.PromptType]
for more details about the format of each prompt.
use_tqdm: If `True`, shows a tqdm progress bar.
If a callable (e.g., `functools.partial(tqdm, leave=False)`),
it is used to create the progress bar.
If `False`, no progress bar is created.
lora_request: LoRA request to use for generation, if any.
pooling_params: The pooling parameters for pooling. If None, we
use the default pooling parameters.
Returns:
A list of `PoolingRequestOutput` objects containing the
pooled hidden states in the same order as the input prompts.
"""

return self.encode(
prompts,
use_tqdm=use_tqdm,
lora_request=lora_request,
pooling_params=pooling_params,
truncate_prompt_tokens=truncate_prompt_tokens,
pooling_task="encode",
)

def _embedding_score(
self,
tokenizer: AnyTokenizer,