Changes from all commits (25 commits)
7de18d5  [BUG] [ROCm] [MLA] Fix variable name bug due to change in variable na… (tjtanaa, May 11, 2025)
021c16c  [Model] Broadcast Ovis2 implementation to fit Ovis1.6 (#17861) (Isotr0py, May 12, 2025)
d45fe33  [misc] add instructions on how to install nvshmem/pplx/deepep (#17964) (youkaichao, May 12, 2025)
08bf784  [Bugfix] validate grammar and throw 400 error instead of crashing the… (Jason-CKY, May 12, 2025)
ada50aa  [bugfix] fix the wrong parser (#17958) (reidliu41, May 12, 2025)
19a3c78  [Bugfix] Fix pydantic.errors.PydanticUserError (#17962) (Potabk, May 12, 2025)
4307830  [Bugfix][TPU] Use np array when updating cache slot_mapping (#17971) (lsy323, May 12, 2025)
891b9d3  [Fix] Benchmark `"EngineClient" has no attribute "model_config"` (#17… (b8zhong, May 12, 2025)
3a5ea75  [Feature] Support DeepSeekV3 Function Call (#17784) (Xu-Wenqing, May 12, 2025)
9fbf2bf  Correcting testcases in builkite job for IBM Power (#17675) (AaruniAggarwal, May 12, 2025)
a658de3  tp_size in metadata and handshake with rank0 first (NickLucche, May 6, 2025)
8db3605  split kv_cache along head dim (NickLucche, May 10, 2025)
7ea6cb2  [Misc] Improve modelscope import error (#17983) (jeejeelee, May 12, 2025)
05a4324  Initialize the delta tool call fields explicitly (#17340) (maxdebayser, May 12, 2025)
d191102  [P/D] NIXL Integration (#17751) (robertgshaw2-redhat, May 12, 2025)
98ea356  [Lora][Frontend]Add default local directory LoRA resolver plugin. (#1… (jberkhahn, May 12, 2025)
72a3f6b  Construct `KVTransferConfig` properly from Python instead of using JS… (hmellor, May 12, 2025)
b9fd0d7  [CI/Build] Fix TPU V1 Test mixed use of & and && across tests (#17968) (CAROLZXYZXY, May 12, 2025)
289199f  [Core] Use platform-agnostic device control for DP engine core (#17245) (jianzs, May 12, 2025)
e9c730c  Enabling "Weight Loading Multiple GPU Test - Large Models" (#18020) (Alexei-V-Ivanov-AMD, May 12, 2025)
302f3ac  [v1][KVCacheManager] Change prefix caching metric from counting block… (heheda12345, May 12, 2025)
195adb4  [Chore] Remove unused method (#18024) (robertgshaw2-redhat, May 12, 2025)
2b0db9b  Enable standard language model for torhc nightly (#18004) (yangw-dev, May 12, 2025)
ebab1ac  [CI] Make JSON output tests less likely to fail (#17859) (russellb, May 12, 2025)
c869e75  updated (robertgshaw2-redhat, May 13, 2025)
9 changes: 6 additions & 3 deletions .buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@@ -32,9 +32,12 @@ function cpu_tests() {
set -e
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
pip install sentence-transformers datamodel_code_generator
pytest -v -s tests/models/embedding/language/test_cls_models.py::test_classification_models[float-jason9693/Qwen2.5-1.5B-apeach]
pytest -v -s tests/models/embedding/language/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]
pytest -v -s tests/models/encoder_decoder/language -m cpu_model"
pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
pytest -v -s tests/models/language/pooling/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]"
}

# All of CPU tests are expected to be finished less than 40 mins.
42 changes: 21 additions & 21 deletions .buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -26,27 +26,27 @@ docker run --privileged --net host --shm-size=16G -it \
&& tpu-info \
&& { \
echo TEST_0: Running test_perf.py; \
pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \
echo TEST_0_EXIT_CODE: \$?; \
} & \
&& { \
{ \
echo TEST_1: Running test_compilation.py; \
pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \
echo TEST_1_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_2: Running test_basic.py; \
pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \
echo TEST_2_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_3: Running test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
echo TEST_3_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_4: Running test_quantization_accuracy.py; \
pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py; \
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py; \
echo TEST_4_EXIT_CODE: \$?; \
} & \
{ \
@@ -56,43 +56,43 @@
} & \
{ \
echo TEST_6: Running test_tpu_model_runner.py; \
pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \
python3 -m pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \
echo TEST_6_EXIT_CODE: \$?; \
} & \
&& { \
{ \
echo TEST_7: Running test_sampler.py; \
pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \
echo TEST_7_EXIT_CODE: \$?; \
} & \
&& { \
{ \
echo TEST_8: Running test_topk_topp_sampler.py; \
pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \
echo TEST_8_EXIT_CODE: \$?; \
} & \
&& { \
{ \
echo TEST_9: Running test_multimodal.py; \
pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \
echo TEST_9_EXIT_CODE: \$?; \
} & \
&& { \
{ \
echo TEST_10: Running test_pallas.py; \
pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \
echo TEST_10_EXIT_CODE: \$?; \
} & \
&& { \
{ \
echo TEST_11: Running test_struct_output_generate.py; \
pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \
echo TEST_11_EXIT_CODE: \$?; \
} & \
&& { \
{ \
echo TEST_12: Running test_moe_pallas.py; \
pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \
echo TEST_12_EXIT_CODE: \$?; \
} & \
# Disable the TPU LoRA tests until the feature is activated
# && { \
# & { \
# echo TEST_13: Running test_moe_pallas.py; \
# pytest -s -v /workspace/vllm/tests/tpu/lora/; \
# python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/; \
# echo TEST_13_EXIT_CODE: \$?; \
# } & \
wait \
8 changes: 7 additions & 1 deletion .buildkite/test-pipeline.yaml
@@ -472,12 +472,14 @@ steps:

- label: Language Models Test (Standard)
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models/language
commands:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
- pip install 'git+https://github.com/Dao-AILab/[email protected]'
- pip freeze | grep -E 'torch'
- pytest -v -s models/language -m core_model

- label: Language Models Test (Extended)
@@ -493,11 +495,13 @@

- label: Multi-Modal Models Test (Standard)
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pip freeze | grep -E 'torch'
- pytest -v -s models/multimodal/processing
- pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model
- cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
@@ -628,7 +632,7 @@ steps:
- vllm/plugins/
- tests/plugins/
commands:
# begin platform plugin tests, all the code in-between runs on dummy platform
# begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
- pip install -e ./plugins/vllm_add_dummy_platform
- pytest -v -s plugins_tests/test_platform_plugins.py
- pip uninstall vllm_add_dummy_platform -y
@@ -639,6 +643,7 @@
- pytest -v -s distributed/test_distributed_oot.py
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
- pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins

- label: Multi-step Tests (4 GPUs) # 36min
mirror_hardwares: [amdexperimental]
@@ -702,6 +707,7 @@ steps:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt

- label: Weight Loading Multiple GPU Test - Large Models # optional
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
gpu: a100
7 changes: 4 additions & 3 deletions benchmarks/benchmark_throughput.py
@@ -146,9 +146,10 @@ async def run_vllm_async(

async with build_async_engine_client_from_engine_args(
engine_args, disable_frontend_multiprocessing) as llm:
model_config = await llm.get_model_config()
assert all(
llm.model_config.max_model_len >= (request.prompt_len +
request.expected_output_len)
model_config.max_model_len >= (request.prompt_len +
request.expected_output_len)
for request in requests), (
"Please ensure that max_model_len is greater than the sum of"
" prompt_len and expected_output_len for all requests.")
@@ -599,7 +600,7 @@ def validate_args(args):
"--lora-path",
type=str,
default=None,
help="Path to the lora adapters to use. This can be an absolute path, "
help="Path to the LoRA adapters to use. This can be an absolute path, "
"a relative path, or a Hugging Face model identifier.")
parser.add_argument(
"--prefix-len",
7 changes: 5 additions & 2 deletions docs/source/features/lora.md
@@ -159,9 +159,12 @@ Alternatively, you can use the LoRAResolver plugin to dynamically load LoRA adap

You can set up multiple LoRAResolver plugins if you want to load LoRA adapters from different sources. For example, you might have one resolver for local files and another for S3 storage. vLLM will load the first LoRA adapter that it finds.

You can either install existing plugins or implement your own.
You can either install existing plugins or implement your own. By default, vLLM comes with a [resolver plugin to load LoRA adapters from a local directory.](https://github.com/vllm-project/vllm/tree/main/vllm/plugins/lora_resolvers)
To enable this resolver, set `VLLM_ALLOW_RUNTIME_LORA_UPDATING` to True, set `VLLM_PLUGINS` to include `lora_filesystem_resolver`, and then set `VLLM_LORA_RESOLVER_CACHE_DIR` to a local directory. When vLLM receives a request using a LoRA adapter `foobar`,
it will first look in the local directory for a directory `foobar`, and attempt to load the contents of that directory as a LoRA adapter. If successful, the request will complete as normal and
that adapter will then be available for normal use on the server.
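For illustration only (not part of this diff), a request that exercises the filesystem resolver could look like the sketch below; the server address, adapter name, and cache directory are assumptions, and the adapter name must match a subdirectory under `VLLM_LORA_RESOLVER_CACHE_DIR`:

```python
# Hedged sketch: assumes an OpenAI-compatible vLLM server started with
#   VLLM_ALLOW_RUNTIME_LORA_UPDATING=True
#   VLLM_PLUGINS=lora_filesystem_resolver
#   VLLM_LORA_RESOLVER_CACHE_DIR=/path/to/adapters
# "my-lora-adapter" is a hypothetical name; the resolver looks for
# /path/to/adapters/my-lora-adapter and loads it on first use.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
completion = client.completions.create(
    model="my-lora-adapter",  # the LoRA adapter name, resolved from the cache dir
    prompt="Hello, my name is",
    max_tokens=16,
)
print(completion.choices[0].text)
```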

Steps to implement your own LoRAResolver plugin:
Alternatively, follow these example steps to implement your own plugin:
1. Implement the LoRAResolver interface.

Example of a simple S3 LoRAResolver implementation:
7 changes: 7 additions & 0 deletions docs/source/features/tool_calling.md
@@ -236,6 +236,13 @@ For Qwen2.5, the chat template in tokenizer_config.json has already included sup

Flags: `--tool-call-parser hermes`

### DeepSeek-V3 Models (`deepseek_v3`)

Supported models:
* `deepseek-ai/DeepSeek-V3-0324`

Flags: `--tool-call-parser deepseek_v3 --chat-template examples/tool_chat_template_deepseekv3.jinja`
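As a minimal sketch (not taken from this diff), a tool-call request against a server launched with the flags above might look like the following; the `get_weather` tool schema and the prompt are illustrative assumptions:

```python
# Hedged sketch: assumes a server started roughly as
#   vllm serve deepseek-ai/DeepSeek-V3-0324 --enable-auto-tool-choice \
#     --tool-call-parser deepseek_v3 \
#     --chat-template examples/tool_chat_template_deepseekv3.jinja
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",  # made-up tool for the example
        "description": "Get the current weather for a city",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]
response = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-V3-0324",
    messages=[{"role": "user", "content": "What is the weather in Berlin?"}],
    tools=tools,
    tool_choice="auto",
)
print(response.choices[0].message.tool_calls)
```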

### Models with Pythonic Tool Calls (`pythonic`)

A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models.
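To make the format concrete, here is a small, self-contained sketch (not vLLM's actual parser) showing how such a Python-list style output can be interpreted; the model output string and tool names are made up:

```python
# Hedged sketch of the "pythonic" tool-call format: the model emits a Python
# list of call expressions instead of JSON. This toy parser is illustrative
# only and is not the implementation used by vLLM.
import ast

model_output = '[get_weather(city="Berlin"), get_time(timezone="Europe/Berlin")]'
calls = ast.parse(model_output, mode="eval").body  # an ast.List of ast.Call nodes
for call in calls.elts:
    name = call.func.id
    args = {kw.arg: ast.literal_eval(kw.value) for kw in call.keywords}
    print(name, args)  # e.g. get_weather {'city': 'Berlin'}
```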
6 changes: 3 additions & 3 deletions docs/source/models/supported_models.md
@@ -1045,10 +1045,10 @@ Specified using `--task generate`.
*
* ✅︎
* ✅︎
- * `Ovis2ForConditionalGeneration`<sup>^</sup>
* Ovis2
- * `Ovis`
* Ovis2, Ovis1.6
* T + I<sup>+</sup>
* `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis2-2B`, etc.
* `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc.
*
*
* ✅︎
14 changes: 8 additions & 6 deletions examples/lmcache/disagg_prefill_lmcache_v0.py
@@ -49,9 +49,10 @@ def run_prefill(prefill_done, prompts):

sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)

ktc = KVTransferConfig.from_cli(
'{"kv_connector":"LMCacheConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}'
)
ktc = KVTransferConfig(kv_connector="LMCacheConnector",
kv_role="kv_producer",
kv_rank=0,
kv_parallel_size=2)
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
# memory. Reduce the value if your GPU has less memory.
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
@@ -78,9 +79,10 @@ def run_decode(prefill_done, prompts, timeout=1):

sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)

ktc = KVTransferConfig.from_cli(
'{"kv_connector":"LMCacheConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}'
)
ktc = KVTransferConfig(kv_connector="LMCacheConnector",
kv_role="kv_consumer",
kv_rank=1,
kv_parallel_size=2)
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
# of memory. Reduce the value if your GPU has less memory.
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
8 changes: 4 additions & 4 deletions examples/lmcache/kv_cache_sharing_lmcache_v1.py
@@ -49,8 +49,8 @@ def run_store(store_done, prompts):

sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)

ktc = KVTransferConfig.from_cli(
'{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}')
ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1",
kv_role="kv_both")
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
# memory. Reduce the value if your GPU has less memory.
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
@@ -76,8 +76,8 @@ def run_retrieve(store_done, prompts, timeout=1):

sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)

ktc = KVTransferConfig.from_cli(
'{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}')
ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1",
kv_role="kv_both")
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
# of memory. Reduce the value if your GPU has less memory.
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
@@ -16,16 +16,17 @@

sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)

llm = LLM(
model="meta-llama/Llama-3.2-1B-Instruct",
enforce_eager=True,
gpu_memory_utilization=0.8,
max_num_batched_tokens=64,
max_num_seqs=16,
kv_transfer_config=KVTransferConfig.from_cli(
'{"kv_connector":"SharedStorageConnector","kv_role":"kv_both",'
'"kv_connector_extra_config": {"shared_storage_path": "local_storage"}}'
)) #, max_model_len=2048, max_num_batched_tokens=2048)
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
enforce_eager=True,
gpu_memory_utilization=0.8,
max_num_batched_tokens=64,
max_num_seqs=16,
kv_transfer_config=KVTransferConfig(
kv_connector="SharedStorageConnector",
kv_role="kv_both",
kv_connector_extra_config={
"shared_storage_path": "local_storage"
})) #, max_model_len=2048, max_num_batched_tokens=2048)

# 1ST generation (prefill instance)
outputs = llm.generate(prompts, sampling_params)
@@ -17,11 +17,12 @@
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
enforce_eager=True,
gpu_memory_utilization=0.8,
kv_transfer_config=KVTransferConfig.from_cli(
'{"kv_connector":"SharedStorageConnector","kv_role":"kv_both", '
'"kv_connector_extra_config": '
'{"shared_storage_path": "local_storage"}}')
) #, max_model_len=2048, max_num_batched_tokens=2048)
kv_transfer_config=KVTransferConfig(
kv_connector="SharedStorageConnector",
kv_role="kv_both",
kv_connector_extra_config={
"shared_storage_path": "local_storage"
})) #, max_model_len=2048, max_num_batched_tokens=2048)

# 1ST generation (prefill instance)
outputs = llm.generate(
14 changes: 8 additions & 6 deletions examples/offline_inference/disaggregated_prefill.py
@@ -32,9 +32,10 @@ def run_prefill(prefill_done):
# This instance is the prefill node (kv_producer, rank 0).
# The number of parallel instances for KV cache transfer is set to 2,
# as required for PyNcclConnector.
ktc = KVTransferConfig.from_cli(
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}'
)
ktc = KVTransferConfig(kv_connector="PyNcclConnector",
kv_role="kv_producer",
kv_rank=0,
kv_parallel_size=2)

# Set GPU memory utilization to 0.8 for an A6000 GPU with 40GB
# memory. You may need to adjust the value to fit your GPU.
@@ -71,9 +72,10 @@ def run_decode(prefill_done):
# This instance is the decode node (kv_consumer, rank 1).
# The number of parallel instances for KV cache transfer is set to 2,
# as required for PyNcclConnector.
ktc = KVTransferConfig.from_cli(
'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}'
)
ktc = KVTransferConfig(kv_connector="PyNcclConnector",
kv_role="kv_consumer",
kv_rank=1,
kv_parallel_size=2)

# Set GPU memory utilization to 0.8 for an A6000 GPU with 40GB
# memory. You may need to adjust the value to fit your GPU.
21 changes: 12 additions & 9 deletions examples/offline_inference/vision_language.py
@@ -725,8 +725,8 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
)


# Ovis2
def run_ovis2(questions: list[str], modality: str) -> ModelRequestData:
# Ovis
def run_ovis(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"

model_name = "AIDC-AI/Ovis2-1B"
@@ -737,15 +737,18 @@ def run_ovis2(questions: list[str], modality: str) -> ModelRequestData:
max_num_seqs=2,
trust_remote_code=True,
dtype="half",
hf_overrides={"architectures": ["Ovis2ForConditionalGeneration"]},
limit_mm_per_prompt={modality: 1},
)

placeholder = "<image>\n"
prompts = [("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n{placeholder}"
f"{question}<|im_end|>\n"
"<|im_start|>assistant\n") for question in questions]
tokenizer = AutoTokenizer.from_pretrained(model_name,
trust_remote_code=True)
messages = [[{
'role': 'user',
'content': f"<image>\n{question}"
}] for question in questions]
prompts = tokenizer.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)

return ModelRequestData(
engine_args=engine_args,
@@ -1069,7 +1072,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
"llama4": run_llama4,
"molmo": run_molmo,
"NVLM_D": run_nvlm_d,
"ovis2": run_ovis2,
"ovis": run_ovis,
"paligemma": run_paligemma,
"paligemma2": run_paligemma2,
"phi3_v": run_phi3v,