
Commit 216fc0e

[feature] Prompt Embeddings Support for v1 Engine (#3026)
### What this PR does / why we need it?
Based on [19746](vllm-project/vllm#19746), this PR adds Prompt Embeddings support for the v1 engine on NPU.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
```python
python examples/prompt_embed_inference.py
```

- vLLM version: v0.11.0
- vLLM main: vllm-project/vllm@releases/v0.11.1

---------

Signed-off-by: jesse <[email protected]>
1 parent f6149f3 commit 216fc0e
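For quick orientation, the core flow this commit enables is sketched below, condensed from `examples/prompt_embed_inference.py` in this diff: build embeddings with the Hugging Face embedding layer, then hand them to `LLM.generate()` with `enable_prompt_embeds=True`. This is a minimal sketch, not part of the commit; the model name is illustrative (it is the one used by the new e2e test) and default sampling parameters are assumed.

```python
# Minimal sketch of the prompt-embeddings flow added by this PR
# (condensed from examples/prompt_embed_inference.py in this commit).
# The model name is illustrative; any causal LM supported by vLLM works.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from vllm import LLM

model_name = "Qwen/Qwen2.5-0.5B-Instruct"

# Prompt embeddings are produced outside vLLM, e.g. with the HF embedding layer.
tokenizer = AutoTokenizer.from_pretrained(model_name)
hf_model = AutoModelForCausalLM.from_pretrained(model_name)
embedding_layer = hf_model.get_input_embeddings()

token_ids = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Please tell me about the capital of France."}],
    add_generation_prompt=True,
    return_tensors="pt",
)
prompt_embeds = embedding_layer(token_ids).squeeze(0)  # (seq_len, hidden_size)

# enable_prompt_embeds=True is required; generate() then accepts
# {"prompt_embeds": <tensor>} instead of a text prompt.
llm = LLM(model=model_name, enable_prompt_embeds=True)
outputs = llm.generate({"prompt_embeds": prompt_embeds})
print(outputs[0].outputs[0].text)
```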

File tree

5 files changed (+447, -17 lines)


.github/workflows/_e2e_test.yaml

Lines changed: 1 addition & 0 deletions
```diff
@@ -88,6 +88,7 @@ jobs:
           # We found that if running aclgraph tests in batch, it will cause AclmdlRICaptureBegin error. So we run
           # the test separately.
+          pytest -sv tests/e2e/singlecard/test_completion_with_prompt_embeds.py
           pytest -sv tests/e2e/singlecard/test_aclgraph.py
           pytest -sv tests/e2e/singlecard/test_ascend_scheduler.py
           pytest -sv tests/e2e/singlecard/test_bge_model.py
```

examples/prompt_embed_inference.py

Lines changed: 97 additions & 0 deletions
```python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Demonstrates how to generate prompt embeddings using
Hugging Face Transformers and use them as input to vLLM
for both single and batch inference.

Model: meta-llama/Llama-3.2-1B-Instruct
Note: This model is gated on Hugging Face Hub.
      You must request access to use it:
      https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct

Requirements:
- vLLM
- transformers

Run:
    python examples/prompt_embed_inference.py
"""

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizer

from vllm import LLM


def init_tokenizer_and_llm(model_name: str):
    llm = LLM(model=model_name, enable_prompt_embeds=True)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
    embedding_layer = transformers_model.get_input_embeddings()
    return tokenizer, embedding_layer, llm


def get_prompt_embeds(
    chat: list[dict[str, str]],
    tokenizer: PreTrainedTokenizer,
    embedding_layer: torch.nn.Module,
):
    token_ids = tokenizer.apply_chat_template(
        chat, add_generation_prompt=True, return_tensors="pt"
    )
    prompt_embeds = embedding_layer(token_ids).squeeze(0)
    return prompt_embeds


def single_prompt_inference(
    llm: LLM, tokenizer: PreTrainedTokenizer, embedding_layer: torch.nn.Module
):
    chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
    prompt_embeds = get_prompt_embeds(chat, tokenizer, embedding_layer)

    outputs = llm.generate(
        {
            "prompt_embeds": prompt_embeds,
        }
    )

    print("\n[Single Inference Output]")
    print("-" * 30)
    for o in outputs:
        print(o.outputs[0].text)
    print("-" * 30)


def batch_prompt_inference(
    llm: LLM, tokenizer: PreTrainedTokenizer, embedding_layer: torch.nn.Module
):
    chats = [
        [{"role": "user", "content": "Please tell me about the capital of France."}],
        [{"role": "user", "content": "When is the day longest during the year?"}],
        [{"role": "user", "content": "Where is bigger, the moon or the sun?"}],
    ]

    prompt_embeds_list = [
        get_prompt_embeds(chat, tokenizer, embedding_layer) for chat in chats
    ]

    outputs = llm.generate([{"prompt_embeds": embeds} for embeds in prompt_embeds_list])

    print("\n[Batch Inference Outputs]")
    print("-" * 30)
    for i, o in enumerate(outputs):
        print(f"Q{i + 1}: {chats[i][0]['content']}")
        print(f"A{i + 1}: {o.outputs[0].text}\n")
    print("-" * 30)


def main():
    model_name = "meta-llama/Llama-3.2-1B-Instruct"
    tokenizer, embedding_layer, llm = init_tokenizer_and_llm(model_name)
    single_prompt_inference(llm, tokenizer, embedding_layer)
    batch_prompt_inference(llm, tokenizer, embedding_layer)


if __name__ == "__main__":
    main()
```
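As an aside (not part of the committed example), prompt embeddings bypass vLLM's tokenizer, so a quick shape/dtype sanity check before calling `llm.generate()` can catch mismatches early. The helper below is a hypothetical sketch that only relies on objects the example already creates; `hidden_size` is read from the Hugging Face model config.

```python
# Hypothetical helper (illustrative, not part of the example file):
# validate a prompt_embeds tensor before handing it to vLLM.
import torch
from transformers import PreTrainedModel


def check_prompt_embeds(prompt_embeds: torch.Tensor,
                        transformers_model: PreTrainedModel) -> None:
    embedding_weight = transformers_model.get_input_embeddings().weight
    # As in the example, each request's tensor is 2-D: (seq_len, hidden_size).
    assert prompt_embeds.dim() == 2, "expected (seq_len, hidden_size)"
    assert prompt_embeds.shape[-1] == transformers_model.config.hidden_size, \
        "embedding width does not match the model's hidden size"
    # Keeping the dtype aligned with the source embedding table avoids implicit casts.
    assert prompt_embeds.dtype == embedding_weight.dtype, "dtype mismatch"
```

It could be wired in right after `get_prompt_embeds()`, provided `init_tokenizer_and_llm()` were also made to return the `transformers_model` it loads.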
tests/e2e/singlecard/test_completion_with_prompt_embeds.py

Lines changed: 197 additions & 0 deletions
```python
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import os

import pytest
from transformers import AutoModelForCausalLM, AutoTokenizer

from tests.e2e.conftest import VllmRunner

os.environ["VLLM_USE_MODELSCOPE"] = "True"
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"]


def get_prompt_embeds(chat, tokenizer, embedding_layer):
    """Convert chat messages to prompt embeddings."""
    token_ids = tokenizer.apply_chat_template(chat,
                                              add_generation_prompt=True,
                                              return_tensors='pt')
    prompt_embeds = embedding_layer(token_ids).squeeze(0)
    return prompt_embeds


@pytest.mark.parametrize("model_name", MODELS)
def test_single_prompt_embeds_inference(model_name):
    """Test single prompt inference with prompt embeddings."""
    # Prepare prompt embeddings
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
    embedding_layer = transformers_model.get_input_embeddings()

    chat = [{
        "role": "user",
        "content": "Please tell me about the capital of France."
    }]
    prompt_embeds = get_prompt_embeds(chat, tokenizer, embedding_layer)

    # Run inference with prompt embeddings
    with VllmRunner(
            model_name,
            enable_prompt_embeds=True,
            enforce_eager=True,
    ) as vllm_runner:
        outputs = vllm_runner.model.generate({
            "prompt_embeds": prompt_embeds,
        })

    # Verify output
    assert len(outputs) == 1
    assert len(outputs[0].outputs) > 0
    assert len(outputs[0].outputs[0].text) > 0
    print(f"\n[Single Inference Output]: {outputs[0].outputs[0].text}")


@pytest.mark.parametrize("model_name", MODELS)
def test_batch_prompt_embeds_inference(model_name):
    """Test batch prompt inference with prompt embeddings."""
    # Prepare prompt embeddings
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
    embedding_layer = transformers_model.get_input_embeddings()

    chats = [[{
        "role": "user",
        "content": "Please tell me about the capital of France."
    }],
             [{
                 "role": "user",
                 "content": "When is the day longest during the year?"
             }],
             [{
                 "role": "user",
                 "content": "Where is bigger, the moon or the sun?"
             }]]

    prompt_embeds_list = [
        get_prompt_embeds(chat, tokenizer, embedding_layer) for chat in chats
    ]

    # Run batch inference with prompt embeddings
    with VllmRunner(
            model_name,
            enable_prompt_embeds=True,
            enforce_eager=True,
    ) as vllm_runner:
        outputs = vllm_runner.model.generate([{
            "prompt_embeds": embeds
        } for embeds in prompt_embeds_list])

    # Verify outputs
    assert len(outputs) == len(chats)
    for i, output in enumerate(outputs):
        assert len(output.outputs) > 0
        assert len(output.outputs[0].text) > 0
        print(f"\nQ{i+1}: {chats[i][0]['content']}")
        print(f"A{i+1}: {output.outputs[0].text}")


@pytest.mark.parametrize("model_name", MODELS)
def test_prompt_embeds_with_aclgraph(model_name):
    """Test prompt embeddings with ACL graph enabled vs disabled."""
    # Prepare prompt embeddings
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
    embedding_layer = transformers_model.get_input_embeddings()

    chat = [{"role": "user", "content": "What is the capital of China?"}]
    prompt_embeds = get_prompt_embeds(chat, tokenizer, embedding_layer)

    # Run with ACL graph enabled (enforce_eager=False)
    with VllmRunner(
            model_name,
            enable_prompt_embeds=True,
            enforce_eager=False,
    ) as vllm_aclgraph_runner:
        aclgraph_outputs = vllm_aclgraph_runner.model.generate({
            "prompt_embeds": prompt_embeds,
        })

    # Run with ACL graph disabled (enforce_eager=True)
    with VllmRunner(
            model_name,
            enable_prompt_embeds=True,
            enforce_eager=True,
    ) as vllm_eager_runner:
        eager_outputs = vllm_eager_runner.model.generate({
            "prompt_embeds": prompt_embeds,
        })

    # Verify both produce valid outputs
    assert len(aclgraph_outputs) == 1
    assert len(eager_outputs) == 1
    assert len(aclgraph_outputs[0].outputs[0].text) > 0
    assert len(eager_outputs[0].outputs[0].text) > 0

    print("\n[ACL Graph Output]:", aclgraph_outputs[0].outputs[0].text)
    print("[Eager Output]:", eager_outputs[0].outputs[0].text)

    # Note: Outputs may differ slightly due to different execution paths,
    # but both should be valid responses


@pytest.mark.parametrize("model_name", MODELS)
def test_mixed_prompt_embeds_and_text(model_name):
    """Test mixed inputs with both prompt embeddings and text prompts."""
    # Prepare prompt embeddings for first request
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
    embedding_layer = transformers_model.get_input_embeddings()

    chat = [{"role": "user", "content": "What is AI?"}]
    prompt_embeds = get_prompt_embeds(chat, tokenizer, embedding_layer)

    # Prepare text prompt for second request
    text_prompt = "What is machine learning?"

    # Run inference with mixed inputs
    with VllmRunner(
            model_name,
            enable_prompt_embeds=True,
            enforce_eager=True,
    ) as vllm_runner:
        # Test prompt embeddings
        embeds_output = vllm_runner.model.generate({
            "prompt_embeds": prompt_embeds,
        })

        # Test text prompt
        text_output = vllm_runner.model.generate(text_prompt)

    # Verify both types of inputs work
    assert len(embeds_output) == 1
    assert len(text_output) == 1
    assert len(embeds_output[0].outputs[0].text) > 0
    assert len(text_output[0].outputs[0].text) > 0

    print("\n[Prompt Embeds Output]:", embeds_output[0].outputs[0].text)
    print("[Text Prompt Output]:", text_output[0].outputs[0].text)
```
