-
Notifications
You must be signed in to change notification settings - Fork 78
Description
用的是这里这个例子,因为vllm server不能访问外网,用本地图片代替
而且条件比较特殊,也不能本地加载模型
# Local image path: the vLLM server cannot reach the internet, so the image
# is read from disk and sent base64-encoded instead of as a remote URL.
image_url = "cat_snow.jpg"
# Original remote asset, kept for reference only (unreachable from the server).
# NOTE: the original code reassigned image_url to this URL on the next line,
# which would make encode_image() call open() on an https URL and fail.
# image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"
text = "A cat standing in the snow."
def encode_image(path: str) -> str:
    """Read the file at *path* and return its bytes as a base64 UTF-8 string."""
    with open(path, "rb") as image_file:
        raw = image_file.read()
    return base64.b64encode(raw).decode("utf-8")
def create_chat_embeddings(
    client: OpenAI,
    *,
    messages: list[ChatCompletionMessageParam],
    model: str,
    encoding_format: Literal["base64", "float"] | NotGiven = NOT_GIVEN,
    continue_final_message: bool = True,
    add_special_tokens: bool = True,
) -> CreateEmbeddingResponse:
    """
    Convenience wrapper around vLLM's Chat Embeddings API, an extension of
    OpenAI's Embeddings API that accepts chat-style (multimodal) messages.
    """
    # Assemble the request payload explicitly, then POST it to the
    # vLLM-specific /embeddings endpoint via the raw client transport.
    payload = {
        "messages": messages,
        "model": model,
        "encoding_format": encoding_format,
        "continue_final_message": continue_final_message,
        "add_special_tokens": add_special_tokens,
    }
    return client.post("/embeddings", cast_to=CreateEmbeddingResponse, body=payload)
def print_embeddings(embeds):
    """Print an embedding vector, truncated to its first 4 values when long."""
    size = len(embeds)
    if size > 4:
        # Render the first four values and replace the closing bracket
        # with an ellipsis marker.
        shown = str(embeds[:4])[:-1] + ", ...]"
    else:
        shown = embeds
    print(f"Embeddings: {shown} (size={size})")
def run_qwen3_vl(client: OpenAI, model: str):
    """
    Compute text-only, image-only, and image+text embeddings with a
    Qwen3-VL embedding model served by vLLM, then print their pairwise
    cosine similarities.

    Relies on module-level globals `text` and `image_url` (a local file
    path) and on a `calculate_cosine_similarity` helper defined elsewhere
    in the file.

    Start the server using:
    vllm serve Qwen/Qwen3-VL-Embedding-2B \
    --runner pooling \
    --max-model-len 8192
    """
    # System-prompt instruction prepended to every request.
    default_instruction = "Represent the user's input."
    # --- 1) Text-only embedding ---
    print("Text embedding output:")
    response = create_chat_embeddings(
        client,
        messages=[
            {
                "role": "system",
                "content": [
                    {"type": "text", "text": default_instruction},
                ],
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": text},
                ],
            },
            # Empty assistant turn + continue_final_message=True below:
            # presumably so pooling happens at the end of the prompt without
            # closing the final turn — confirm against vLLM's docs.
            {
                "role": "assistant",
                "content": [
                    {"type": "text", "text": ""},
                ],
            },
        ],
        model=model,
        encoding_format="float",
        continue_final_message=True,
        add_special_tokens=True,
    )
    text_embedding = response.data[0].embedding
    print_embeddings(response.data[0].embedding)
    # --- 2) Image-only embedding ---
    # The image is sent inline as a base64 data URL because the server has
    # no internet access (see module-level comments).
    print("Image embedding output:")
    response = create_chat_embeddings(
        client,
        messages=[
            {
                "role": "system",
                "content": [
                    {"type": "text", "text": default_instruction},
                ],
            },
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encode_image(image_url)}"}},
                    {"type": "text", "text": ""},
                ],
            },
            {
                "role": "assistant",
                "content": [
                    {"type": "text", "text": ""},
                ],
            },
        ],
        model=model,
        encoding_format="float",
        continue_final_message=True,
        add_special_tokens=True,
    )
    image_embedding = response.data[0].embedding
    print_embeddings(response.data[0].embedding)
    # --- 3) Combined image + text embedding ---
    print("Image+Text embedding output:")
    response = create_chat_embeddings(
        client,
        messages=[
            {
                "role": "system",
                "content": [
                    {"type": "text", "text": default_instruction},
                ],
            },
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encode_image(image_url)}"}},
                    {
                        "type": "text",
                        "text": f"{text}",
                    },
                ],
            },
            {
                "role": "assistant",
                "content": [
                    {"type": "text", "text": ""},
                ],
            },
        ],
        model=model,
        encoding_format="float",
        continue_final_message=True,
        add_special_tokens=True,
    )
    image_text_embedding = response.data[0].embedding
    print_embeddings(response.data[0].embedding)
    # Pairwise similarities: text vs image+text, text vs image,
    # image+text vs image. calculate_cosine_similarity is not defined in
    # this snippet — TODO confirm its definition/normalization.
    print(calculate_cosine_similarity(text_embedding, image_text_embedding))
    print(calculate_cosine_similarity(text_embedding, image_embedding))
    print(calculate_cosine_similarity(image_text_embedding, image_embedding))
最终得到的三个余弦相似度结果如下:
0.7227959439983467
0.5680971196302782
0.7123613733469794
例子里面 text_embedding和image_embedding的相似度只有0.56?
我试了很多个图片/文字的例子用来计算余弦相似度,但是发现这么搭建的话,用什么例子余弦相似度都没有超过0.9的
公司条件比较特殊,没办法本地运行模型,只能通过API访问+docker启动
docker的vllm镜像是最新的0.15
这是我的启动命令
podman run -d --name vllm-qwen3-vl-embedding-8b-VLLM015 -e VLLM_API_KEY=my_token --security-opt=label=disable --ipc=host --network=host --gpus all -e CUDA_VISIBLE_DEVICES=6 -v /home/qwen3_vl_embedding_8B:/models/Qwen3_vl_embedding_8B:ro vllm/vllm-openai:latest --model /models/Qwen3_vl_embedding_8B --port 8002 --host 0.0.0.0 --tensor-parallel-size 1 --dtype bfloat16 --kv-cache-dtype fp8_e4m3 --max-model-len 32768 --enable-prefix-caching --enable-chunked-prefill --runner=pooling --max-num-batched-tokens 131072 --gpu-memory-utilization 0.96 --swap-space 8 --served-model-name qwen3_vl_embedding_8b