Merged
79 changes: 65 additions & 14 deletions .buildkite/test-amd.yaml
@@ -454,8 +454,8 @@ steps:
- pytest -v -s compile/test_fusion_attn.py
- pytest -v -s compile/test_functionalization.py
- pytest -v -s compile/test_silu_mul_quant_fusion.py
- pytest -v -s compile/test_sequence_parallelism.py
- pytest -v -s compile/test_async_tp.py
# - pytest -v -s compile/test_sequence_parallelism.py
# - pytest -v -s compile/test_async_tp.py
- pytest -v -s compile/test_fusion_all_reduce.py
- pytest -v -s compile/test_decorator.py
- pytest -v -s compile/test_noop_elimination.py
@@ -474,8 +474,8 @@ steps:
- pytest -v -s compile/test_basic_correctness.py
- pytest -v -s compile/piecewise/

- label: PyTorch Fullgraph Test # 20min
timeout_in_minutes: 30
- label: PyTorch Fullgraph Test # 22min
timeout_in_minutes: 35
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
@@ -485,6 +485,7 @@
- tests/compile
commands:
- pytest -v -s compile/test_full_graph.py
- pytest -v -s compile/test_fusions_e2e.py

- label: Kernels Core Operation Test # 48min
timeout_in_minutes: 75
@@ -494,6 +495,7 @@
source_file_dependencies:
- csrc/
- tests/kernels/core
- tests/kernels/test_top_k_per_row.py
Contributor (high):

The test file tests/kernels/test_top_k_per_row.py is CUDA-specific, as indicated by @pytest.mark.skipif(not current_platform.is_cuda(), reason="This test requires CUDA") within the file. Including it as a source dependency in test-amd.yaml is incorrect; the dependency should be removed from the AMD pipeline.
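For reference, the guard quoted above is a standard pytest skip marker; a minimal sketch (only the decorator is taken from the comment, the test body is a hypothetical placeholder):

```python
import pytest

from vllm.platforms import current_platform  # import path assumed from vLLM

# The quoted guard skips every test in the file on non-CUDA (e.g. ROCm/AMD)
# runners, so listing the file as an AMD source dependency only triggers
# runs that immediately skip.
@pytest.mark.skipif(not current_platform.is_cuda(),
                    reason="This test requires CUDA")
def test_top_k_per_row():  # hypothetical placeholder body
    ...
```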

commands:
- pytest -v -s kernels/core kernels/test_top_k_per_row.py

@@ -606,7 +608,7 @@ steps:
# we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.13.0
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

- label: LM Eval Small Models # 53min
timeout_in_minutes: 75
@@ -848,6 +850,18 @@ steps:
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work

- label: Multi-Modal Accuracy Eval (Small Models) # 50min
mirror_hardwares: [amdexperimental]
Collaborator:

Why don't we use amdproduction to enable this in actual CI?

Collaborator:

This pipeline, in its current state, is meant for monitoring only, not for gating PRs.
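If the step were later promoted from monitoring to gating, the change would presumably mirror other steps in this file; a hypothetical sketch (pool names and the grade key are taken from elsewhere in this diff):

```yaml
# Current form: mirrored only to the experimental pool (monitoring-only).
mirror_hardwares: [amdexperimental]

# Hypothetical gating form, following other steps in test-amd.yaml:
# mirror_hardwares: [amdexperimental, amdproduction]
# grade: Blocking
```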

agent_pool: mi325_1
timeout_in_minutes: 70
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- vllm/multimodal/
- vllm/inputs/
- vllm/v1/core/
commands:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1

- label: Multi-Modal Models Test (Extended) 1
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
@@ -923,8 +937,8 @@ steps:
# Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

- label: Blackwell Test # 38 min
timeout_in_minutes: 60
- label: Blackwell Test # 21 min
Collaborator:

Do we still need to test Blackwell on this AMD-specific pipeline?

Collaborator (author):

We include that test group for completeness. For the moment it is just a comment, not an instruction for action, in test-amd.yaml. But things may change in the future.

timeout_in_minutes: 30
Comment on lines +940 to +941

Contributor (critical):

While this change updates the label and timeout, the entire 'Blackwell Test' step is configured for NVIDIA hardware (gpu: b200 on line 943) and is not suitable for the test-amd.yaml pipeline. The step should be removed entirely from this file.
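For context, the two hardware-targeting conventions that appear in this diff look roughly like this (labels and values copied from the surrounding steps; an illustrative contrast, not a schema):

```yaml
# NVIDIA-targeted step, as in the Blackwell and H200 steps below:
- label: Blackwell Test
  gpu: b200

# AMD-targeted step, the convention used throughout test-amd.yaml:
- label: PyTorch Fullgraph Test
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
```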

working_dir: "/vllm-workspace/"
gpu: b200
# optional: true
@@ -937,8 +951,6 @@ steps:
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/fusion.py
- vllm/compilation/fusion_attn.py
commands:
- nvidia-smi
- python3 examples/offline_inference/basic/chat.py
@@ -955,13 +967,32 @@ steps:
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
# Fusion
- pytest -v -s tests/compile/test_fusion_all_reduce.py
- pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
- pytest -v -s tests/kernels/moe/test_flashinfer.py

- label: Blackwell Fusion Tests # 30 min
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
gpu: b200
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
commands:
- nvidia-smi
- pytest -v -s tests/compile/test_fusion_attn.py
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_gpus=2 is not set
- pytest -v -s tests/compile/test_fusion_all_reduce.py
- pytest -v -s tests/compile/test_fusions_e2e.py
Comment on lines +976 to +995

Contributor (critical):

This newly added 'Blackwell Fusion Tests' step is configured for NVIDIA Blackwell (B200) GPUs and includes the nvidia-smi command. This is incorrect for the test-amd.yaml pipeline, which is intended for AMD hardware. The entire step should be removed.


- label: Blackwell GPT-OSS Eval
timeout_in_minutes: 60
@@ -1128,6 +1159,11 @@ steps:
- pytest -v -s plugins_tests/test_io_processor_plugins.py
- pip uninstall prithvi_io_processor_plugin -y
# end io_processor plugins test
# begin stat_logger plugins test
- pip install -e ./plugins/vllm_add_dummy_stat_logger
- pytest -v -s plugins_tests/test_stats_logger_plugins.py
- pip uninstall dummy_stat_logger -y
# end stat_logger plugins test
# other tests continue here:
- pytest -v -s plugins_tests/test_scheduler_plugins.py
- pip install -e ./plugins/vllm_add_dummy_model
@@ -1172,7 +1208,6 @@ steps:
- pytest -v -s -x lora/test_llama_tp.py
- pytest -v -s -x lora/test_llm_with_multi_loras.py


- label: Weight Loading Multiple GPU Test # 33min
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
@@ -1201,6 +1236,18 @@ steps:
commands:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt

- label: NixlConnector PD accuracy tests (Distributed) # 30min
mirror_hardwares: [amdexperimental]
agent_pool: mi325_4
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh

##### multi gpus test #####
##### A100 test #####
@@ -1232,12 +1279,16 @@ steps:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

##### H200 test #####
- label: Distrubted Tests (H200) # optional
- label: Distributed Tests (H200) # optional
gpu: h200
optional: true
working_dir: "/vllm-workspace/"
num_gpus: 2
commands:
- pytest -v -s tests/compile/test_async_tp.py
- pytest -v -s tests/compile/test_sequence_parallelism.py
- pytest -v -s tests/compile/test_fusion_all_reduce.py
- pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
Comment on lines +1283 to 1294

Contributor (critical):

This test step is configured for NVIDIA H200 GPUs (gpu: h200) and is not suitable for the test-amd.yaml pipeline. While the changes mirror another file, the entire step, including the typo fix and the newly added tests, should be removed from the AMD-specific configuration.

