65 commits
1d097a8
support new model ovis2_5
myselvess Aug 4, 2025
a076041
Merge remote-tracking branch 'upstream/main' into ovis2_5
myselvess Aug 4, 2025
59e35e7
Update vllm/transformers_utils/configs/ovis2_5.py
myselvess Aug 4, 2025
7339792
Update vllm/model_executor/models/ovis2_5.py
myselvess Aug 4, 2025
86ee33c
Update vllm/model_executor/models/ovis2_5.py
myselvess Aug 4, 2025
f5f32fe
update ovis25
myselvess Aug 4, 2025
5fa4392
update ovis25: fix pre-commit error
myselvess Aug 5, 2025
a4aca4b
update ovis25: rm useless code & lazy import fa & add sdpa fallback
myselvess Aug 6, 2025
48bb18b
[Sampler] Support returning all logprobs or logits (#21792)
22quinn Aug 4, 2025
034a6e6
[Doc] Update pooling model docs (#22186)
DarkLight1337 Aug 4, 2025
2b6b1b0
Fix Arcee model weight loading: Add custom load_weights (#21725)
alyosha-swamy Aug 4, 2025
8aca87d
[Responses API] Ignore `store=True` and process the request by defaul…
WoosukKwon Aug 4, 2025
6c50176
[Bug] Update auto_tune.sh to separate benchmarking and profiling. (#2…
ericehanley Aug 4, 2025
e852784
[Bugfix][V1][P/D]Fix the uneven polling issue in the toy proxy for P2…
Abatom Aug 4, 2025
3990320
[NVIDIA] Auto detect modelopt quant and fix DSR1-FP4 weight loading (…
nvpohanh Aug 5, 2025
3da80f1
[Bugfix] V1 Fix the cursor leakage issue during request scheduling. (…
CLFutureX Aug 5, 2025
4da7635
Revert "[Bugfix] V1 Fix the cursor leakage issue during request sched…
WoosukKwon Aug 5, 2025
35f4186
[V1] reduce block size for tree attention correctness test to fix 'ou…
TheEpicDolphin Aug 5, 2025
a071de6
[V0 deprecation][P/D] Deprecate v0 `KVConnectorBase` code (1/2) (#21785)
lk-chen Aug 5, 2025
de61a28
[FEAT] Refactor ROPE into module (#22192)
tjtanaa Aug 5, 2025
700ef18
[ROCm][Bugfix] Compilation passes fix (#22202)
gshtras Aug 5, 2025
37bf7c9
self.gate dtype update for GLM-4.5 (#22203)
zRzRzRzRzRzRzR Aug 5, 2025
203763e
[Log] DeepGEMM Update Log for Unaligned Problem Size (#22208)
yewentao256 Aug 5, 2025
c201acd
fix: kimi_k2 return empty tool call list (#22149)
tlipoca9 Aug 5, 2025
a0e7148
[Misc] Remove pass_config from CompilationConfig dump_json excluded (…
elvischenv Aug 5, 2025
53839cc
[Doc] add backend to doc string of initialize_model_parallel (#22142)
andyxning Aug 5, 2025
b522428
[Misc] log more detailed message for ensure_model_parallel_initialize…
andyxning Aug 5, 2025
56f61a2
Optimize configuration access with LRU cache in custom ops (#22204)
skyloevil Aug 5, 2025
e08e133
[Bugfix] Misaligned params in TreeAttentionImpl (#22226)
DarkLight1337 Aug 5, 2025
cef7bc7
[UX] Fail if an invalid attention backend is specified (#22217)
mgoin Aug 5, 2025
2fae634
[Core] Factor out common logic for MM budget calculation (#22228)
DarkLight1337 Aug 5, 2025
288df33
[Model] Pooling model activation supports per request control by Pool…
noooop Aug 5, 2025
0f8fd2a
[Docs][TPU] Highlight TPU Software version selection (#22242)
NickLucche Aug 5, 2025
546d1fc
Migrate KimiVLImagePixelInputs to TensorSchema (#21769)
bbeckca Aug 5, 2025
cadd196
[Feature] Non-contiguous Support for FP8 Quantization (#21961)
yewentao256 Aug 5, 2025
8115c0b
[NVIDIA] Support Flashinfer TRT-LLM Prefill Attention Kernel (#22095)
elvischenv Aug 5, 2025
9484461
[Misc] correct static type check for GroupCoordinator (#21946)
andyxning Aug 5, 2025
d3a2319
[V0 Deprecation][TPU] Remove V1 flag check from tests (#22248)
NickLucche Aug 5, 2025
ef71436
Use UV_LINK_MODE=copy in Dockerfile to avoid hardlink fail (#22128)
mgoin Aug 5, 2025
16756f8
[CI/Build] Update flashinfer to 0.2.9 (#22233)
mgoin Aug 5, 2025
8deb9c6
[Refactor] Remove Unused Environment Variable `VLLM_NO_DEPRECATION_WA…
yewentao256 Aug 5, 2025
2aa4220
[V1] port xformers backend to v1 (#21342)
TheEpicDolphin Aug 5, 2025
8e5c32e
[bugfix] fix blackwell deepep installation (#22255)
youkaichao Aug 5, 2025
a3aa86d
[CI][TPU] Fix docker clean up (#22271)
lsy323 Aug 5, 2025
641e798
[Bugfix] Remove faulty test for oot attention backend (#22286)
mgoin Aug 6, 2025
5f60595
[Bugfix] Fix 3D input passed into cutlass_scaled_mm (#22278)
mgoin Aug 6, 2025
6f1c763
[Bugfix] Fix MoE BNB version (#22260)
jeejeelee Aug 6, 2025
b9da0de
[Perf] Parallelize fill_bitmask to accelerate high-throughput guided …
benchislett Aug 6, 2025
8616bab
[Bugfix] Skip dead and non-GPU nodes for Ray DP engine allocation (#2…
ruisearch42 Aug 6, 2025
f68f297
[Bugfix][CI/Build][ROCm] Make sure to use the headers from the build …
gshtras Aug 6, 2025
32d32a4
Upgrade FA3 for attention sink (#22313)
WoosukKwon Aug 6, 2025
ad71837
Increase openai-python version (#22316)
WoosukKwon Aug 6, 2025
0896a44
Add attention sink in attention backends (#22320)
WoosukKwon Aug 6, 2025
dd600d6
Update transformers to `v4.55` (#21931)
hmellor Aug 6, 2025
d79c7c9
Add GPT-OSS model code and config [1/N] (#22327)
WoosukKwon Aug 6, 2025
2eb3e32
[ROCm] Add attention sink to use_rocm_custom_paged_attention (#22329)
WoosukKwon Aug 6, 2025
4d6bd07
[GptOss] Add GptOss reasoning parser to support structure output (#22…
heheda12345 Aug 6, 2025
ce5af1d
[gpt-oss] flashinfer attention sink init (#22330)
zyongye Aug 6, 2025
7d60990
[gpt-oss] Add openai-harmony as default dependency (#22332)
WoosukKwon Aug 6, 2025
d9f0efc
[Misc] Clean up duplicated hf overrides (#22311)
Isotr0py Aug 6, 2025
80e78fe
[gpt-oss] Add Tool/ConversationContext classes and harmony_utils (#22…
WoosukKwon Aug 6, 2025
c8e4a3f
[gpt-oss] add model to supported models doc (#22336)
Aug 6, 2025
f6e9640
[gpt-oss] Support chat completion api (#22342)
WoosukKwon Aug 6, 2025
49c138b
[Minor] Fix type (#22347)
WoosukKwon Aug 6, 2025
c975712
update ovis25: rm useless code & fix get_image_size_with_most_feature…
myselvess Aug 7, 2025
3 changes: 1 addition & 2 deletions .buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@@ -4,8 +4,7 @@ set -xu


remove_docker_container() {
docker rm -f tpu-test || true;
docker rm -f vllm-tpu || true;
docker rm -f tpu-test || true;
}

trap remove_docker_container EXIT
1 change: 0 additions & 1 deletion .buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -5,7 +5,6 @@ set -xu

remove_docker_container() {
docker rm -f tpu-test || true;
docker rm -f vllm-tpu || true;
}

trap remove_docker_container EXIT
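Aside (not part of the PR diff): the TPU CI scripts above all use the same trap-on-EXIT cleanup idiom. A minimal standalone sketch of that pattern, using an illustrative container name:

#!/bin/bash
set -xu

# Force-remove the container left over from a previous run.
# `|| true` keeps the cleanup from changing the script's exit status
# when the container does not exist.
remove_docker_container() {
  docker rm -f tpu-test || true
}

# Run the cleanup on every exit path, normal or error.
trap remove_docker_container EXIT

# ... build the image, start the container, and run the tests here ...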
2 changes: 1 addition & 1 deletion .buildkite/scripts/tpu/config_v6e_1.env
@@ -1,6 +1,6 @@
# Environment config
TEST_NAME=llama8b
CONTAINER_NAME=vllm-tpu
CONTAINER_NAME=tpu-test

# vllm config
MODEL=meta-llama/Llama-3.1-8B-Instruct
2 changes: 0 additions & 2 deletions .buildkite/scripts/tpu/docker_run_bm.sh
@@ -12,8 +12,6 @@ source /etc/environment
source $ENV_FILE

remove_docker_container() {
docker rm -f tpu-test || true;
docker rm -f vllm-tpu || true;
docker rm -f $CONTAINER_NAME || true;
}

2 changes: 1 addition & 1 deletion .buildkite/scripts/tpu/quantized_v6e_1.env
@@ -1,6 +1,6 @@
# Environment config
TEST_NAME=llama8bw8a8
CONTAINER_NAME=vllm-tpu
CONTAINER_NAME=tpu-test

# vllm config
MODEL=RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8
3 changes: 1 addition & 2 deletions .buildkite/test-pipeline.yaml
@@ -664,7 +664,7 @@ steps:
# Attention
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_decode_attention.py
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
- pytest -v -s tests/kernels/test_cutlass_mla_decode.py
# Quantization
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
@@ -749,7 +749,6 @@ steps:
# this test fails consistently.
# TODO: investigate and fix
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s models/multimodal/generation/test_maverick.py

123 changes: 80 additions & 43 deletions benchmarks/auto_tune/auto_tune.sh
@@ -49,6 +49,7 @@ best_throughput=0
best_max_num_seqs=0
best_num_batched_tokens=0
best_goodput=0
best_request_rate=0

start_server() {
local gpu_memory_utilization=$1
@@ -57,18 +58,35 @@ start_server() {
local vllm_log=$4
local profile_dir=$5

pkill -f vllm
pkill -if vllm

VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \
--port 8004 \
--gpu-memory-utilization $gpu_memory_utilization \
--max-num-seqs $max_num_seqs \
--max-num-batched-tokens $max_num_batched_tokens \
--tensor-parallel-size $TP \
--enable-prefix-caching \
--load-format dummy \
--download-dir "$DOWNLOAD_DIR" \
--max-model-len $MAX_MODEL_LEN > "$vllm_log" 2>&1 &
# Define the common arguments as a bash array.
# Each argument and its value are separate elements.
local common_args_array=(
"$MODEL"
"--disable-log-requests"
"--port" "8004"
"--gpu-memory-utilization" "$gpu_memory_utilization"
"--max-num-seqs" "$max_num_seqs"
"--max-num-batched-tokens" "$max_num_batched_tokens"
"--tensor-parallel-size" "$TP"
"--enable-prefix-caching"
"--load-format" "dummy"
"--download-dir" "$DOWNLOAD_DIR"
"--max-model-len" "$MAX_MODEL_LEN"
)

# Use the array expansion "${common_args_array[@]}"
# This correctly passes each element as a separate argument.
if [[ -n "$profile_dir" ]]; then
# Start server with profiling enabled
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
else
# Start server without profiling
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
fi

# wait for 10 minutes...
server_started=0
@@ -82,6 +100,7 @@ start_server() {
sleep 10
fi
done

if (( ! server_started )); then
echo "server did not start within 10 minutes. Please check server log at $vllm_log".
return 1
@@ -90,37 +109,20 @@ start_server() {
fi
}

update_best_profile() {
local profile_dir=$1
local profile_index=$2
sorted_paths=($(find "$profile_dir" -maxdepth 1 -not -path "$profile_dir" | sort))
selected_profile_file=
if [[ "$SYSTEM" == "TPU" ]]; then
selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb"
fi
if [[ "$SYSTEM" == "GPU" ]]; then
selected_profile_file="${sorted_paths[$profile_index]}"
fi
rm -f $PROFILE_PATH/*
cp $selected_profile_file $PROFILE_PATH
}

run_benchmark() {
local max_num_seqs=$1
local max_num_batched_tokens=$2
local gpu_memory_utilization=$3
echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
local profile_dir="$LOG_FOLDER/profile_${max_num_seqs}_${max_num_batched_tokens}"
echo "vllm_log: $vllm_log"
echo
rm -f $vllm_log
mkdir -p $profile_dir
pkill -f vllm
local profile_index=0
pkill -if vllm

echo "starting server..."
start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log $profile_dir
# Call start_server without a profile_dir to avoid profiling overhead
start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log ""
result=$?
if [[ "$result" -eq 1 ]]; then
echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
@@ -134,7 +136,8 @@ run_benchmark() {
# get a basic qps by using request-rate inf
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
adjusted_input_len=$(( INPUT_LEN - prefix_len ))
adjusted_input_len=$(( INPUT_LEN - prefix_len ))
# --profile flag is removed from this call
vllm bench serve \
--backend vllm \
--model $MODEL \
@@ -148,8 +151,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 1000 \
--random-prefix-len $prefix_len \
--port 8004 \
--profile &> "$bm_log"
--port 8004 &> "$bm_log"
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
@@ -163,7 +165,6 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
# start from request-rate as int(throughput) + 1
request_rate=$((${throughput%.*} + 1))
while ((request_rate > 0)); do
profile_index=$((profile_index+1))
# clear prefix cache
curl -X POST http://0.0.0.0:8004/reset_prefix_cache
sleep 5
@@ -201,12 +202,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
best_max_num_seqs=$max_num_seqs
best_num_batched_tokens=$max_num_batched_tokens
best_goodput=$goodput
if [[ "$SYSTEM" == "TPU" ]]; then
update_best_profile "$profile_dir/plugins/profile" $profile_index
fi
if [[ "$SYSTEM" == "GPU" ]]; then
update_best_profile "$profile_dir" $profile_index
fi
best_request_rate=$request_rate
fi
else
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
@@ -215,7 +211,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))

echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"

pkill vllm
pkill -if vllm
sleep 10
printf '=%.0s' $(seq 1 20)
return 0
@@ -228,7 +224,8 @@ read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST"
gpu_memory_utilization=0.98
find_gpu_memory_utilization=0
while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log"
# Pass empty string for profile_dir argument
start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" ""
result=$?
if [[ "$result" -eq 0 ]]; then
find_gpu_memory_utilization=1
@@ -251,5 +248,45 @@ for num_seqs in "${num_seqs_list[@]}"; do
done
done
echo "finish permutations"

# =================================================================================
# FINAL PROFILING RUN FOR THE BEST CONFIGURATION
# =================================================================================
if (( $(echo "$best_throughput > 0" | bc -l) )); then
echo
echo "Benchmark tuning finished. Now running profiling on the best configuration found..."
echo "Best config: max_num_seqs: $best_max_num_seqs, max_num_batched_tokens: $best_num_batched_tokens, throughput: $best_throughput"
echo

vllm_log="$LOG_FOLDER/vllm_log_BEST_PROFILE.txt"
bm_log="$LOG_FOLDER/bm_log_BEST_PROFILE.txt"

# Start server with the best params and profiling ENABLED
echo "Starting server for profiling..."
start_server $gpu_memory_utilization $best_max_num_seqs $best_num_batched_tokens "$vllm_log" "$PROFILE_PATH"

# Run benchmark with the best params and the --profile flag
echo "Running benchmark with profiling..."
prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
adjusted_input_len=$(( INPUT_LEN - prefix_len ))
vllm bench serve \
--backend vllm \
--model $MODEL \
--dataset-name random \
--random-input-len $adjusted_input_len \
--random-output-len $OUTPUT_LEN \
--ignore-eos \
--disable-tqdm \
--request-rate $best_request_rate \
--percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \
--random-prefix-len $prefix_len \
--port 8004 \
--profile &> "$bm_log"
else
echo "No configuration met the latency requirements. Skipping final profiling run."
fi
pkill -if vllm
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
@@ -41,7 +41,6 @@ def benchmark_decode(
device = "cuda"
torch.manual_seed(0)

# Currently only HEAD_GRP_SIZE == 8 is supported
HEAD_GRP_SIZE = 8
MAX_SEQ_LEN = max_seq_len
