29 changes: 25 additions & 4 deletions docs/features/multimodal_inputs.md
@@ -359,13 +359,19 @@ Full example: [examples/offline_inference/audio_language.py](../../examples/offl
To input pre-computed embeddings for a given modality (image, video, or audio) directly to the language model,
pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.

You must enable this feature via `enable_mm_embeds=True`.

!!! warning
    The vLLM engine may crash if embeddings with an incorrect shape are passed.
    Only enable this flag for trusted users!

??? code

```python
from vllm import LLM

# Inference with image embeddings as input
llm = LLM(model="llava-hf/llava-1.5-7b-hf")
llm = LLM(model="llava-hf/llava-1.5-7b-hf", enable_mm_embeds=True)

# Refer to the HuggingFace repo for the correct format to use
prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
@@ -397,7 +403,11 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd
image_embeds = torch.load(...)

# Qwen2-VL
llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
llm = LLM(
"Qwen/Qwen2-VL-2B-Instruct",
limit_mm_per_prompt={"image": 4},
enable_mm_embeds=True,
)
mm_data = {
"image": {
"image_embeds": image_embeds,
@@ -407,7 +417,12 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd
}

# MiniCPM-V
llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4})
llm = LLM(
"openbmb/MiniCPM-V-2_6",
trust_remote_code=True,
limit_mm_per_prompt={"image": 4},
enable_mm_embeds=True,
)
mm_data = {
"image": {
"image_embeds": image_embeds,
@@ -732,7 +747,13 @@ Full example: [examples/online_serving/openai_chat_completion_client_for_multimo
### Embedding Inputs

To input pre-computed embeddings for a given modality (image, video, or audio) directly to the language model,
pass a tensor of shape to the corresponding field of the multi-modal dictionary.
pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.

You must enable this feature via the `--enable-mm-embeds` flag in `vllm serve`.

!!! warning
    The vLLM engine may crash if embeddings with an incorrect shape are passed.
    Only enable this flag for trusted users!

#### Image Embedding Inputs

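As an illustration of the embedding-input path this file documents, here is a minimal offline sketch (not taken from the diff). It assumes a pre-computed LLaVA image-embedding tensor of the expected `(num_items, feature_size, hidden_size of LM)` shape and uses a placeholder file path; the prompt format follows the model's HuggingFace repo:

```python
import torch
from vllm import LLM

# Hypothetical pre-computed image features saved earlier; the tensor must
# match the (num_items, feature_size, hidden_size of LM) shape expected by
# the target model.
image_embeds = torch.load("image_embeds.pt")  # placeholder path

# enable_mm_embeds=True is now required for embedding inputs.
llm = LLM(model="llava-hf/llava-1.5-7b-hf", enable_mm_embeds=True)

outputs = llm.generate({
    "prompt": "USER: <image>\nWhat is the content of this image?\nASSISTANT:",
    "multi_modal_data": {"image": image_embeds},
})

for output in outputs:
    print(output.outputs[0].text)
```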
6 changes: 5 additions & 1 deletion docs/features/prompt_embeds.md
@@ -20,12 +20,16 @@ You can pass prompt embeddings from Hugging Face Transformers models to the `'p

## Online Serving

Our OpenAI-compatible server accepts prompt embeddings inputs via the [Completions API](https://platform.openai.com/docs/api-reference/completions). Prompt embeddings inputs are added via a new `'prompt_embeds'` key in the JSON package.
Our OpenAI-compatible server accepts prompt embeddings inputs via the [Completions API](https://platform.openai.com/docs/api-reference/completions). Prompt embeddings inputs are added via a new `'prompt_embeds'` key in the JSON package and are enabled by the `--enable-prompt-embeds` flag in `vllm serve`.

When a mixture of `'prompt_embeds'` and `'prompt'` inputs is provided in a single request, the prompt embeds are always returned first.

Prompt embeddings are passed in as base64 encoded torch tensors.

!!! warning
    The vLLM engine may crash if embeddings with an incorrect shape are passed.
    Only enable this flag for trusted users!

### Transformers Inputs via OpenAI Client

First, launch the OpenAI-compatible server:
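For the Completions-API flow described above, here is a minimal client-side sketch (not taken from the diff). It assumes a server launched with `vllm serve <model> --enable-prompt-embeds` and reuses the torch.save-plus-base64 serialization exercised by the tests in this PR; the tensor shape and model name are placeholders:

```python
import base64
import io

import torch
from openai import OpenAI

# Hypothetical prompt embeddings of shape (seq_len, hidden_size); in practice
# these come from a Hugging Face model's input embedding layer.
prompt_embeds = torch.randn(8, 4096, dtype=torch.float16)

# Serialize the tensor with torch.save and base64-encode the raw bytes.
buffer = io.BytesIO()
torch.save(prompt_embeds, buffer)
buffer.seek(0)
encoded_embeds = base64.b64encode(buffer.getvalue()).decode("utf-8")

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
completion = client.completions.create(
    model="<served-model-name>",
    # An empty string is sent because only embeddings carry the prompt; the
    # OpenAI client does not allow omitting `prompt` entirely.
    prompt="",
    max_tokens=5,
    temperature=0.0,
    # `prompt_embeds` is a vLLM extension, so it is passed via `extra_body`.
    extra_body={"prompt_embeds": encoded_embeds},
)
print(completion.choices[0].text)
```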
1 change: 1 addition & 0 deletions examples/offline_inference/prithvi_geospatial_mae.py
@@ -49,6 +49,7 @@ def __init__(self, model):
dtype="float16",
enforce_eager=True,
model_impl="terratorch",
enable_mm_embeds=True,
)

def run(self, input_data, location_coords):
@@ -38,6 +38,7 @@ def main():
max_num_seqs=32,
io_processor_plugin="prithvi_to_tiff",
model_impl="terratorch",
enable_mm_embeds=True,
)

pooling_params = PoolingParams(task="token_classify", activation=False)
1 change: 1 addition & 0 deletions examples/online_serving/prithvi_geospatial_mae.py
@@ -19,6 +19,7 @@
# --task embed --trust-remote-code
# --skip-tokenizer-init --enforce-eager
# --io-processor-plugin prithvi_to_tiff
# --enable-mm-embeds


def main():
17 changes: 16 additions & 1 deletion tests/entrypoints/llm/test_prompt_validation.py
@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest
import torch

from vllm import LLM

@@ -12,8 +13,22 @@ def test_empty_prompt():
llm.generate([""])


@pytest.mark.skip_v1
def test_out_of_vocab_token():
llm = LLM(model="openai-community/gpt2", enforce_eager=True)
with pytest.raises(ValueError, match="out of vocabulary"):
llm.generate({"prompt_token_ids": [999999]})


def test_require_mm_embeds():
llm = LLM(
model="llava-hf/llava-1.5-7b-hf",
enforce_eager=True,
enable_mm_embeds=False,
)
with pytest.raises(ValueError, match="--enable-mm-embeds"):
llm.generate(
{
"prompt": "<image>",
"multi_modal_data": {"image": torch.empty(1, 1, 1)},
}
)
13 changes: 13 additions & 0 deletions tests/entrypoints/openai/test_completion_with_prompt_embeds.py
@@ -292,3 +292,16 @@ async def test_prompt_logprobs_raises_error(
temperature=0.0,
extra_body={"prompt_embeds": encoded_embeds, "prompt_logprobs": True},
)


@pytest.mark.asyncio
async def test_empty_prompt_embeds(
client_with_prompt_embeds: openai.AsyncOpenAI,
) -> None:
await client_with_prompt_embeds.completions.create(
model=MODEL_NAME,
prompt="Hello",
max_tokens=5,
temperature=0.0,
extra_body={"prompt_embeds": []},
)
29 changes: 27 additions & 2 deletions tests/entrypoints/openai/test_prompt_validation.py
@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import io
from unittest.mock import Mock

# imports for structured outputs tests
import openai
@@ -10,7 +11,8 @@
import regex as re
import torch

from vllm.entrypoints.renderer import BaseRenderer
from vllm.config import ModelConfig
from vllm.entrypoints.renderer import CompletionRenderer

from ...utils import RemoteOpenAIServer

@@ -59,6 +61,10 @@ async def test_out_of_vocab_token_ids():
def test_load_prompt_embeds(
dtype: torch.dtype, layout: torch.layout, seq_len: int, hidden_size: int
):
model_config = Mock(spec=ModelConfig)
model_config.enable_prompt_embeds = True
renderer = CompletionRenderer(model_config, tokenizer=None)

# construct arbitrary tensors of various dtypes, layouts, and sizes.
# We need to check against different layouts to make sure that if a user
# uses sparse tensors to reduce the transmission size of prompt embeddings,
@@ -83,11 +89,30 @@ def test_load_prompt_embeds(
buffer.seek(0)
encoded_tensor = pybase64.b64encode(buffer.getvalue())

loaded_prompt_embeds = BaseRenderer.load_prompt_embeds(encoded_tensor)
loaded_prompt_embeds = renderer.load_prompt_embeds(encoded_tensor)
assert len(loaded_prompt_embeds) == 1
loaded_tensor = loaded_prompt_embeds[0]["prompt_embeds"]
assert loaded_tensor.device.type == "cpu"
assert loaded_tensor.layout == torch.strided
torch.testing.assert_close(
loaded_tensor, tensor.to("cpu").to_dense(), equal_nan=True
)


@pytest.mark.parametrize("dtype", [torch.float32])
@pytest.mark.parametrize("seq_len", [2])
@pytest.mark.parametrize("hidden_size", [2])
def test_disable_prompt_embeds(dtype: torch.dtype, seq_len: int, hidden_size: int):
model_config = Mock(spec=ModelConfig)
model_config.enable_prompt_embeds = False
renderer = CompletionRenderer(model_config, tokenizer=None)

tensor = torch.randn((seq_len, hidden_size), dtype=dtype)

buffer = io.BytesIO()
torch.save(tensor, buffer)
buffer.seek(0)
encoded_tensor = pybase64.b64encode(buffer.getvalue())

with pytest.raises(ValueError, match="--enable-prompt-embeds"):
renderer.load_prompt_embeds(encoded_tensor)
@@ -15,30 +15,7 @@
DTYPE = "float16"


@pytest.fixture(scope="module")
def server():
args = [
"--runner",
"pooling",
# use half precision for speed and memory savings in CI environment
"--dtype",
DTYPE,
"--enforce-eager",
"--trust-remote-code",
"--skip-tokenizer-init",
"--max-num-seqs",
"32",
"--model-impl",
"terratorch",
]

with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_request(server: RemoteOpenAIServer, model_name: str):
def _terratorch_dummy_inputs(model_name: str):
pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16)
location_coords = torch.full((1, 2), 1.0, dtype=torch.float16)

@@ -54,7 +31,7 @@ async def test_single_request(server: RemoteOpenAIServer, model_name: str):
binary_data = buffer_coord.read()
base64_coord_embedding = base64.b64encode(binary_data).decode("utf-8")

prompt = {
return {
"model": model_name,
"additional_data": {"prompt_token_ids": [1]},
"encoding_format": "base64",
@@ -74,12 +51,33 @@ async def test_single_request(server: RemoteOpenAIServer, model_name: str):
],
}

# test single pooling
response = requests.post(server.url_for("pooling"), json=prompt)
response.raise_for_status()

output = response.json()["data"][0]["data"]
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_request(model_name: str):
args = [
"--runner",
"pooling",
# use half precision for speed and memory savings in CI environment
"--dtype",
DTYPE,
"--enforce-eager",
"--trust-remote-code",
"--max-num-seqs",
"32",
"--model-impl",
"terratorch",
"--skip-tokenizer-init",
"--enable-mm-embeds",
]

with RemoteOpenAIServer(MODEL_NAME, args) as server:
prompt = _terratorch_dummy_inputs(model_name)

# test single pooling
response = requests.post(server.url_for("pooling"), json=prompt)
response.raise_for_status()

np_response = np.frombuffer(base64.b64decode(output), dtype=np.float32)
output = response.json()["data"][0]["data"]

assert len(np_response) == 524288
np_response = np.frombuffer(base64.b64decode(output), dtype=np.float32)
assert len(np_response) == 524288
21 changes: 17 additions & 4 deletions tests/entrypoints/test_chat_utils.py
@@ -73,6 +73,19 @@ def phi3v_model_config_mm_interleaved():
)


@pytest.fixture(scope="function")
def phi3v_model_config_image_embeds():
return ModelConfig(
PHI3V_MODEL_ID,
runner="generate",
trust_remote_code=True,
limit_mm_per_prompt={
"image": 2,
},
enable_mm_embeds=True,
)


@pytest.fixture(scope="module")
def phi3v_tokenizer():
return get_tokenizer(PHI3V_MODEL_ID)
@@ -799,7 +812,7 @@ def test_parse_chat_messages_empty_pil_image_with_uuid(


def test_parse_chat_messages_empty_image_embeds_with_uuid(
phi3v_model_config,
phi3v_model_config_image_embeds,
phi3v_tokenizer,
):
uuid = "abcd"
@@ -813,7 +826,7 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(
],
}
],
phi3v_model_config,
phi3v_model_config_image_embeds,
phi3v_tokenizer,
content_format="string",
)
@@ -832,7 +845,7 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(

@pytest.mark.asyncio
async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
phi3v_model_config,
phi3v_model_config_image_embeds,
phi3v_tokenizer,
):
uuid = "abcd"
@@ -846,7 +859,7 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
],
}
],
phi3v_model_config,
phi3v_model_config_image_embeds,
phi3v_tokenizer,
content_format="string",
)
1 change: 1 addition & 0 deletions tests/entrypoints/test_renderer.py
@@ -17,6 +17,7 @@
class MockModelConfig:
max_model_len: int = 100
encoder_config: dict | None = None
enable_prompt_embeds: bool = True


class MockTokenizerResult:
3 changes: 1 addition & 2 deletions tests/models/multimodal/generation/test_common.py
@@ -109,8 +109,7 @@
limit_mm_per_prompt={"image": 4},
)
],
# TODO: Revert to "auto" when CPU backend can use torch > 2.6
dtype="bfloat16" if current_platform.is_cpu() else "auto",
vllm_runner_kwargs={"enable_mm_embeds": True},
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
"qwen2_5_vl": VLMTestInfo(
1 change: 1 addition & 0 deletions tests/models/multimodal/generation/test_qwen2_vl.py
@@ -292,6 +292,7 @@ def run_embedding_input_test(
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
default_torch_num_threads=1,
enable_mm_embeds=True,
) as vllm_model:
outputs_per_case_for_original_input = [
vllm_model.generate_greedy_logprobs(
1 change: 1 addition & 0 deletions tests/models/multimodal/pooling/test_prithvi_mae.py
@@ -34,6 +34,7 @@ def _run_test(
dtype="half",
enforce_eager=True,
skip_tokenizer_init=True,
enable_mm_embeds=True,
# Limit the maximum number of sequences to avoid the
# test going OOM during the warmup run
max_num_seqs=32,