Changes from all commits (25 commits)
7de18d5  [BUG] [ROCm] [MLA] Fix variable name bug due to change in variable na… (tjtanaa, May 11, 2025)
021c16c  [Model] Broadcast Ovis2 implementation to fit Ovis1.6 (#17861) (Isotr0py, May 12, 2025)
d45fe33  [misc] add instructions on how to install nvshmem/pplx/deepep (#17964) (youkaichao, May 12, 2025)
08bf784  [Bugfix] validate grammar and throw 400 error instead of crashing the… (Jason-CKY, May 12, 2025)
ada50aa  [bugfix] fix the wrong parser (#17958) (reidliu41, May 12, 2025)
19a3c78  [Bugfix] Fix pydantic.errors.PydanticUserError (#17962) (Potabk, May 12, 2025)
4307830  [Bugfix][TPU] Use np array when updating cache slot_mapping (#17971) (lsy323, May 12, 2025)
891b9d3  [Fix] Benchmark `"EngineClient" has no attribute "model_config"` (#17… (b8zhong, May 12, 2025)
3a5ea75  [Feature] Support DeepSeekV3 Function Call (#17784) (Xu-Wenqing, May 12, 2025)
9fbf2bf  Correcting testcases in builkite job for IBM Power (#17675) (AaruniAggarwal, May 12, 2025)
a658de3  tp_size in metadata and handshake with rank0 first (NickLucche, May 6, 2025)
8db3605  split kv_cache along head dim (NickLucche, May 10, 2025)
7ea6cb2  [Misc] Improve modelscope import error (#17983) (jeejeelee, May 12, 2025)
05a4324  Initialize the delta tool call fields explicitly (#17340) (maxdebayser, May 12, 2025)
d191102  [P/D] NIXL Integration (#17751) (robertgshaw2-redhat, May 12, 2025)
98ea356  [Lora][Frontend]Add default local directory LoRA resolver plugin. (#1… (jberkhahn, May 12, 2025)
72a3f6b  Construct `KVTransferConfig` properly from Python instead of using JS… (hmellor, May 12, 2025)
b9fd0d7  [CI/Build] Fix TPU V1 Test mixed use of & and && across tests (#17968) (CAROLZXYZXY, May 12, 2025)
289199f  [Core] Use platform-agnostic device control for DP engine core (#17245) (jianzs, May 12, 2025)
e9c730c  Enabling "Weight Loading Multiple GPU Test - Large Models" (#18020) (Alexei-V-Ivanov-AMD, May 12, 2025)
302f3ac  [v1][KVCacheManager] Change prefix caching metric from counting block… (heheda12345, May 12, 2025)
195adb4  [Chore] Remove unused method (#18024) (robertgshaw2-redhat, May 12, 2025)
2b0db9b  Enable standard language model for torhc nightly (#18004) (yangw-dev, May 12, 2025)
ebab1ac  [CI] Make JSON output tests less likely to fail (#17859) (russellb, May 12, 2025)
c869e75  updated (robertgshaw2-redhat, May 13, 2025)
9 changes: 6 additions & 3 deletions .buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@@ -32,9 +32,12 @@ function cpu_tests() {
set -e
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
pip install sentence-transformers datamodel_code_generator
pytest -v -s tests/models/embedding/language/test_cls_models.py::test_classification_models[float-jason9693/Qwen2.5-1.5B-apeach]
pytest -v -s tests/models/embedding/language/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]
pytest -v -s tests/models/encoder_decoder/language -m cpu_model"
pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
pytest -v -s tests/models/language/pooling/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]"
}

# All of CPU tests are expected to be finished less than 40 mins.
42 changes: 21 additions & 21 deletions .buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -26,27 +26,27 @@ docker run --privileged --net host --shm-size=16G -it \
&& tpu-info \
&& { \
echo TEST_0: Running test_perf.py; \
pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \
echo TEST_0_EXIT_CODE: \$?; \
} & \
&& { \
{ \
echo TEST_1: Running test_compilation.py; \
pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \
echo TEST_1_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_2: Running test_basic.py; \
pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \
echo TEST_2_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_3: Running test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
echo TEST_3_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_4: Running test_quantization_accuracy.py; \
pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py; \
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py; \
echo TEST_4_EXIT_CODE: \$?; \
} & \
{ \
@@ -56,43 +56,43 @@
} & \
{ \
echo TEST_6: Running test_tpu_model_runner.py; \
pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \
python3 -m pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \
echo TEST_6_EXIT_CODE: \$?; \
} & \
&& { \
{ \
echo TEST_7: Running test_sampler.py; \
pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \
echo TEST_7_EXIT_CODE: \$?; \
} & \
&& { \
{ \
echo TEST_8: Running test_topk_topp_sampler.py; \
pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \
echo TEST_8_EXIT_CODE: \$?; \
} & \
&& { \
{ \
echo TEST_9: Running test_multimodal.py; \
pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \
echo TEST_9_EXIT_CODE: \$?; \
} & \
&& { \
{ \
echo TEST_10: Running test_pallas.py; \
pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \
echo TEST_10_EXIT_CODE: \$?; \
} & \
&& { \
{ \
echo TEST_11: Running test_struct_output_generate.py; \
pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \
echo TEST_11_EXIT_CODE: \$?; \
} & \
&& { \
{ \
echo TEST_12: Running test_moe_pallas.py; \
pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \
echo TEST_12_EXIT_CODE: \$?; \
} & \
# Disable the TPU LoRA tests until the feature is activated
# && { \
# & { \
# echo TEST_13: Running test_moe_pallas.py; \
# pytest -s -v /workspace/vllm/tests/tpu/lora/; \
# python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/; \
# echo TEST_13_EXIT_CODE: \$?; \
# } & \
wait \
8 changes: 7 additions & 1 deletion .buildkite/test-pipeline.yaml
@@ -472,12 +472,14 @@ steps:

- label: Language Models Test (Standard)
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models/language
commands:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
- pip install 'git+https://github.com/Dao-AILab/[email protected]'
- pip freeze | grep -E 'torch'
- pytest -v -s models/language -m core_model

- label: Language Models Test (Extended)
@@ -493,11 +495,13 @@

- label: Multi-Modal Models Test (Standard)
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pip freeze | grep -E 'torch'
- pytest -v -s models/multimodal/processing
- pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model
- cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
@@ -628,7 +632,7 @@ steps:
- vllm/plugins/
- tests/plugins/
commands:
# begin platform plugin tests, all the code in-between runs on dummy platform
# begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
- pip install -e ./plugins/vllm_add_dummy_platform
- pytest -v -s plugins_tests/test_platform_plugins.py
- pip uninstall vllm_add_dummy_platform -y
@@ -639,6 +643,7 @@
- pytest -v -s distributed/test_distributed_oot.py
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
- pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins

- label: Multi-step Tests (4 GPUs) # 36min
mirror_hardwares: [amdexperimental]
@@ -702,6 +707,7 @@ steps:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt

- label: Weight Loading Multiple GPU Test - Large Models # optional
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
gpu: a100
7 changes: 4 additions & 3 deletions benchmarks/benchmark_throughput.py
@@ -146,9 +146,10 @@ async def run_vllm_async(

async with build_async_engine_client_from_engine_args(
engine_args, disable_frontend_multiprocessing) as llm:
model_config = await llm.get_model_config()
assert all(
llm.model_config.max_model_len >= (request.prompt_len +
request.expected_output_len)
model_config.max_model_len >= (request.prompt_len +
request.expected_output_len)
for request in requests), (
"Please ensure that max_model_len is greater than the sum of"
" prompt_len and expected_output_len for all requests.")
@@ -599,7 +600,7 @@ def validate_args(args):
"--lora-path",
type=str,
default=None,
help="Path to the lora adapters to use. This can be an absolute path, "
help="Path to the LoRA adapters to use. This can be an absolute path, "
"a relative path, or a Hugging Face model identifier.")
parser.add_argument(
"--prefix-len",
7 changes: 5 additions & 2 deletions docs/source/features/lora.md
@@ -159,9 +159,12 @@ Alternatively, you can use the LoRAResolver plugin to dynamically load LoRA adap

You can set up multiple LoRAResolver plugins if you want to load LoRA adapters from different sources. For example, you might have one resolver for local files and another for S3 storage. vLLM will load the first LoRA adapter that it finds.

You can either install existing plugins or implement your own.
You can either install existing plugins or implement your own. By default, vLLM comes with a [resolver plugin to load LoRA adapters from a local directory.](https://github.com/vllm-project/vllm/tree/main/vllm/plugins/lora_resolvers)
To enable this resolver, set `VLLM_ALLOW_RUNTIME_LORA_UPDATING` to True, set `VLLM_PLUGINS` to include `lora_filesystem_resolver`, and then set `VLLM_LORA_RESOLVER_CACHE_DIR` to a local directory. When vLLM receives a request using a LoRA adapter `foobar`,
it will first look in the local directory for a directory `foobar`, and attempt to load the contents of that directory as a LoRA adapter. If successful, the request will complete as normal and
that adapter will then be available for normal use on the server.
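For illustration only (not part of this diff), a request that exercises the filesystem resolver could look like the sketch below; the server address, adapter name, and cache directory are assumptions, and the adapter name must match a subdirectory under `VLLM_LORA_RESOLVER_CACHE_DIR`:

```python
# Hedged sketch: assumes an OpenAI-compatible vLLM server started with
#   VLLM_ALLOW_RUNTIME_LORA_UPDATING=True
#   VLLM_PLUGINS=lora_filesystem_resolver
#   VLLM_LORA_RESOLVER_CACHE_DIR=/path/to/adapters
# "my-lora-adapter" is a hypothetical name; the resolver looks for
# /path/to/adapters/my-lora-adapter and loads it on first use.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
completion = client.completions.create(
    model="my-lora-adapter",  # the LoRA adapter name, resolved from the cache dir
    prompt="Hello, my name is",
    max_tokens=16,
)
print(completion.choices[0].text)
```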

Steps to implement your own LoRAResolver plugin:
Alternatively, follow these example steps to implement your own plugin:
1. Implement the LoRAResolver interface.

Example of a simple S3 LoRAResolver implementation:
7 changes: 7 additions & 0 deletions docs/source/features/tool_calling.md
@@ -236,6 +236,13 @@ For Qwen2.5, the chat template in tokenizer_config.json has already included sup

Flags: `--tool-call-parser hermes`

### DeepSeek-V3 Models (`deepseek_v3`)

Supported models:
* `deepseek-ai/DeepSeek-V3-0324`

Flags: `--tool-call-parser deepseek_v3 --chat-template examples/tool_chat_template_deepseekv3.jinja`
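As a minimal sketch (not taken from this diff), a tool-call request against a server launched with the flags above might look like the following; the `get_weather` tool schema and the prompt are illustrative assumptions:

```python
# Hedged sketch: assumes a server started roughly as
#   vllm serve deepseek-ai/DeepSeek-V3-0324 --enable-auto-tool-choice \
#     --tool-call-parser deepseek_v3 \
#     --chat-template examples/tool_chat_template_deepseekv3.jinja
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",  # made-up tool for the example
        "description": "Get the current weather for a city",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]
response = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-V3-0324",
    messages=[{"role": "user", "content": "What is the weather in Berlin?"}],
    tools=tools,
    tool_choice="auto",
)
print(response.choices[0].message.tool_calls)
```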

### Models with Pythonic Tool Calls (`pythonic`)

A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models.
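To make the format concrete, here is a small, self-contained sketch (not vLLM's actual parser) showing how such a Python-list style output can be interpreted; the model output string and tool names are made up:

```python
# Hedged sketch of the "pythonic" tool-call format: the model emits a Python
# list of call expressions instead of JSON. This toy parser is illustrative
# only and is not the implementation used by vLLM.
import ast

model_output = '[get_weather(city="Berlin"), get_time(timezone="Europe/Berlin")]'
calls = ast.parse(model_output, mode="eval").body  # an ast.List of ast.Call nodes
for call in calls.elts:
    name = call.func.id
    args = {kw.arg: ast.literal_eval(kw.value) for kw in call.keywords}
    print(name, args)  # e.g. get_weather {'city': 'Berlin'}
```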
6 changes: 3 additions & 3 deletions docs/source/models/supported_models.md
@@ -1045,10 +1045,10 @@ Specified using `--task generate`.
*
* ✅︎
* ✅︎
- * `Ovis2ForConditionalGeneration`<sup>^</sup>
* Ovis2
- * `Ovis`
* Ovis2, Ovis1.6
* T + I<sup>+</sup>
* `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis2-2B`, etc.
* `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc.
*
*
* ✅︎
14 changes: 8 additions & 6 deletions examples/lmcache/disagg_prefill_lmcache_v0.py
@@ -49,9 +49,10 @@ def run_prefill(prefill_done, prompts):

sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)

ktc = KVTransferConfig.from_cli(
'{"kv_connector":"LMCacheConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}'
)
ktc = KVTransferConfig(kv_connector="LMCacheConnector",
kv_role="kv_producer",
kv_rank=0,
kv_parallel_size=2)
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
# memory. Reduce the value if your GPU has less memory.
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
@@ -78,9 +79,10 @@ def run_decode(prefill_done, prompts, timeout=1):

sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)

ktc = KVTransferConfig.from_cli(
'{"kv_connector":"LMCacheConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}'
)
ktc = KVTransferConfig(kv_connector="LMCacheConnector",
kv_role="kv_consumer",
kv_rank=1,
kv_parallel_size=2)
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
# of memory. Reduce the value if your GPU has less memory.
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
8 changes: 4 additions & 4 deletions examples/lmcache/kv_cache_sharing_lmcache_v1.py
@@ -49,8 +49,8 @@ def run_store(store_done, prompts):

sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)

ktc = KVTransferConfig.from_cli(
'{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}')
ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1",
kv_role="kv_both")
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
# memory. Reduce the value if your GPU has less memory.
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
@@ -76,8 +76,8 @@ def run_retrieve(store_done, prompts, timeout=1):

sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)

ktc = KVTransferConfig.from_cli(
'{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}')
ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1",
kv_role="kv_both")
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
# of memory. Reduce the value if your GPU has less memory.
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
@@ -16,16 +16,17 @@

sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)

llm = LLM(
model="meta-llama/Llama-3.2-1B-Instruct",
enforce_eager=True,
gpu_memory_utilization=0.8,
max_num_batched_tokens=64,
max_num_seqs=16,
kv_transfer_config=KVTransferConfig.from_cli(
'{"kv_connector":"SharedStorageConnector","kv_role":"kv_both",'
'"kv_connector_extra_config": {"shared_storage_path": "local_storage"}}'
)) #, max_model_len=2048, max_num_batched_tokens=2048)
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
enforce_eager=True,
gpu_memory_utilization=0.8,
max_num_batched_tokens=64,
max_num_seqs=16,
kv_transfer_config=KVTransferConfig(
kv_connector="SharedStorageConnector",
kv_role="kv_both",
kv_connector_extra_config={
"shared_storage_path": "local_storage"
})) #, max_model_len=2048, max_num_batched_tokens=2048)

# 1ST generation (prefill instance)
outputs = llm.generate(prompts, sampling_params)
@@ -17,11 +17,12 @@
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
enforce_eager=True,
gpu_memory_utilization=0.8,
kv_transfer_config=KVTransferConfig.from_cli(
'{"kv_connector":"SharedStorageConnector","kv_role":"kv_both", '
'"kv_connector_extra_config": '
'{"shared_storage_path": "local_storage"}}')
) #, max_model_len=2048, max_num_batched_tokens=2048)
kv_transfer_config=KVTransferConfig(
kv_connector="SharedStorageConnector",
kv_role="kv_both",
kv_connector_extra_config={
"shared_storage_path": "local_storage"
})) #, max_model_len=2048, max_num_batched_tokens=2048)

# 1ST generation (prefill instance)
outputs = llm.generate(
14 changes: 8 additions & 6 deletions examples/offline_inference/disaggregated_prefill.py
@@ -32,9 +32,10 @@ def run_prefill(prefill_done):
# This instance is the prefill node (kv_producer, rank 0).
# The number of parallel instances for KV cache transfer is set to 2,
# as required for PyNcclConnector.
ktc = KVTransferConfig.from_cli(
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}'
)
ktc = KVTransferConfig(kv_connector="PyNcclConnector",
kv_role="kv_producer",
kv_rank=0,
kv_parallel_size=2)

# Set GPU memory utilization to 0.8 for an A6000 GPU with 40GB
# memory. You may need to adjust the value to fit your GPU.
@@ -71,9 +72,10 @@ def run_decode(prefill_done):
# This instance is the decode node (kv_consumer, rank 1).
# The number of parallel instances for KV cache transfer is set to 2,
# as required for PyNcclConnector.
ktc = KVTransferConfig.from_cli(
'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}'
)
ktc = KVTransferConfig(kv_connector="PyNcclConnector",
kv_role="kv_consumer",
kv_rank=1,
kv_parallel_size=2)

# Set GPU memory utilization to 0.8 for an A6000 GPU with 40GB
# memory. You may need to adjust the value to fit your GPU.
21 changes: 12 additions & 9 deletions examples/offline_inference/vision_language.py
@@ -725,8 +725,8 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
)


# Ovis2
def run_ovis2(questions: list[str], modality: str) -> ModelRequestData:
# Ovis
def run_ovis(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"

model_name = "AIDC-AI/Ovis2-1B"
@@ -737,15 +737,18 @@ def run_ovis2(questions: list[str], modality: str) -> ModelRequestData:
max_num_seqs=2,
trust_remote_code=True,
dtype="half",
hf_overrides={"architectures": ["Ovis2ForConditionalGeneration"]},
limit_mm_per_prompt={modality: 1},
)

placeholder = "<image>\n"
prompts = [("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n{placeholder}"
f"{question}<|im_end|>\n"
"<|im_start|>assistant\n") for question in questions]
tokenizer = AutoTokenizer.from_pretrained(model_name,
trust_remote_code=True)
messages = [[{
'role': 'user',
'content': f"<image>\n{question}"
}] for question in questions]
prompts = tokenizer.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)

return ModelRequestData(
engine_args=engine_args,
@@ -1069,7 +1072,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
"llama4": run_llama4,
"molmo": run_molmo,
"NVLM_D": run_nvlm_d,
"ovis2": run_ovis2,
"ovis": run_ovis,
"paligemma": run_paligemma,
"paligemma2": run_paligemma2,
"phi3_v": run_phi3v,