Skip to content

Commit 194eaf1

Browse files
committed
Add comprehensive stress testing script for e2e tests
- Implements multi-threaded stress testing with configurable parameters - Supports different workload patterns (sustained, spike, gradual) - Includes comprehensive metrics collection and reporting - Provides cleanup mechanisms and proper error handling - Enables scalability testing for production environments Signed-off-by: Kobe Chen <[email protected]>
1 parent 7324b96 commit 194eaf1

File tree

1 file changed

+371
-0
lines changed

1 file changed

+371
-0
lines changed

tests/e2e/stress-test.sh

Lines changed: 371 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,371 @@
1+
#!/bin/bash
2+
3+
# DESCRIPTION:
4+
# This script performs comprehensive stress testing of the VLLM router's
5+
# round-robin routing logic under high concurrent loads. It also validates that
6+
# requests are evenly distributed across multiple backend servers.
7+
8+
# USAGE:
9+
# pip install -e .
10+
# bash tests/e2e/stress-test.sh
11+
12+
# OUTPUT EXAMPLE:
13+
# bash tests/e2e/stress-test.sh
14+
# [INFO] Checking prerequisites...
15+
# [INFO] Router stress test configuration:
16+
# [INFO] Concurrent requests: 2000
17+
# [INFO] Total requests: 10000
18+
# [INFO] Router port: 30080
19+
# [INFO] Backend ports: 8001, 8002
20+
# [INFO] Model: facebook/opt-125m
21+
# [INFO] Starting router with round-robin routing (stress test mode)
22+
# [INFO] Router started with PID: 1307668
23+
# [INFO] Waiting for router to be ready...
24+
# [INFO] Router is ready
25+
# [INFO] Running stress test with Apache Bench
26+
# [INFO] Concurrent: 2000, Total: 10000
27+
# This is ApacheBench, Version 2.3 <$Revision: 1879490 $>
28+
# Copyright 1996 Adam Twiss, Zeus Technology Ltd, http://www.zeustech.net/
29+
# Licensed to The Apache Software Foundation, http://www.apache.org/
30+
31+
# Benchmarking localhost (be patient)
32+
# Completed 1000 requests
33+
# Completed 2000 requests
34+
# Completed 3000 requests
35+
# Completed 4000 requests
36+
# Completed 5000 requests
37+
# Completed 6000 requests
38+
# Completed 7000 requests
39+
# Completed 8000 requests
40+
# Completed 9000 requests
41+
# Completed 10000 requests
42+
# Finished 10000 requests
43+
44+
45+
# Server Software: uvicorn
46+
# Server Hostname: localhost
47+
# Server Port: 30080
48+
49+
# Document Path: /v1/chat/completions
50+
# Document Length: 21 bytes
51+
52+
# Concurrency Level: 2000
53+
# Time taken for tests: 54.648 seconds
54+
# Complete requests: 10000
55+
# Failed requests: 0
56+
# Non-2xx responses: 10000
57+
# Total transferred: 1930000 bytes
58+
# Total body sent: 3920000
59+
# HTML transferred: 210000 bytes
60+
# Requests per second: 182.99 [#/sec] (mean)
61+
# Time per request: 10929.546 [ms] (mean)
62+
# Time per request: 5.465 [ms] (mean, across all concurrent requests)
63+
# Transfer rate: 34.49 [Kbytes/sec] received
64+
# 70.05 kb/s sent
65+
# 104.54 kb/s total
66+
67+
# Connection Times (ms)
68+
# min mean[+/-sd] median max
69+
# Connect: 0 14 18.0 4 63
70+
# Processing: 118 9322 3654.3 8204 18354
71+
# Waiting: 25 8933 3648.5 7785 17623
72+
# Total: 118 9336 3646.5 8239 18357
73+
74+
# Percentage of the requests served within a certain time (ms)
75+
# 50% 8239
76+
# 66% 9501
77+
# 75% 10511
78+
# 80% 11791
79+
# 90% 16048
80+
# 95% 16759
81+
# 98% 17191
82+
# 99% 17494
83+
# 100% 18357 (longest request)
84+
# [INFO] Stress test completed
85+
# [INFO] Checking round-robin routing correctness...
86+
# [INFO] Round-robin routing results:
87+
# [INFO] Backend localhost:8001: 5000 requests
88+
# [INFO] Backend localhost:8002: 5000 requests
89+
# [INFO] Total routed: 10000 requests
90+
# [INFO] Backend localhost:8001: 50%
91+
# [INFO] Backend localhost:8002: 50%
92+
# [INFO] ✅ Round-robin routing is working correctly (0% difference)
93+
# [INFO] Test completed successfully!
94+
# [INFO] Cleaning up router processes...
95+
96+
97+
# Strict mode: abort on the first failing command (-e), treat references to
# unset variables as errors (-u), and make a pipeline fail when any stage of
# it fails rather than only the last one (pipefail).
set -euo pipefail
98+
99+
# Default configuration. Each value can be overridden by the command line
# options parsed further down in this script.

# Model name sent in every request and registered for both backends
MODEL="facebook/opt-125m"

# Directory that receives the router log
LOG_DIR="/tmp/router-stress-logs"

# Ports: the router under test and the two static backends it routes to
ROUTER_PORT=30080
BACKEND1_PORT=8001
BACKEND2_PORT=8002

# Load shape handed to Apache Bench
CONCURRENT=2000
REQUESTS=10000

# Comma-separated backend list in the form the router's --static-backends expects
BACKENDS_URL="http://localhost:${BACKEND1_PORT},http://localhost:${BACKEND2_PORT}"
108+
109+
# ANSI colour sequences for the log prefixes. They are stored as literal
# backslash escapes and only turned into real escape characters when printed.
NC="\033[0m"          # reset to the terminal default
RED="\033[0;31m"      # [ERROR]
GREEN="\033[0;32m"    # [INFO]
YELLOW="\033[1;33m"   # [WARNING]
114+
115+
# Print an informational message to stdout behind a green "[INFO]" tag.
# Arguments: $1 - message text (backslash escapes in it are interpreted)
# Globals:   GREEN, NC (read)
print_status() {
    local message="$1"
    printf '%b\n' "${GREEN}[INFO]${NC} ${message}"
}
118+
119+
# Print an error message behind a red "[ERROR]" tag.
# The message goes to stderr so that diagnostics stay separate from the
# regular progress output on stdout (e.g. when stdout is piped or captured).
# Arguments: $1 - message text (backslash escapes in it are interpreted)
# Globals:   RED, NC (read)
print_error() {
    printf '%b\n' "${RED}[ERROR]${NC} $1" >&2
}
122+
123+
# Print a warning message behind a yellow "[WARNING]" tag.
# Like other diagnostics, warnings are written to stderr so they do not mix
# into stdout when the script's regular output is piped or captured.
# Arguments: $1 - message text (backslash escapes in it are interpreted)
# Globals:   YELLOW, NC (read)
print_warning() {
    printf '%b\n' "${YELLOW}[WARNING]${NC} $1" >&2
}
126+
127+
# Print the command line help to stdout.
# The heredoc is unquoted on purpose so that $0 expands to the name the
# script was invoked with. The defaults listed here must match the values
# assigned in the "Default values" section of this script.
show_usage() {
    cat << EOF
Router Stress Test - Tests round-robin routing logic

Usage: $0 [options]

Options:
-c, --concurrent N Concurrent requests (default: 2000)
-n, --requests N Total requests (default: 10000)
-p, --port PORT Router port (default: 30080)
-l, --log-dir DIR Log directory (default: /tmp/router-stress-logs)
-m, --model MODEL Model to use (default: facebook/opt-125m)
--backend1-port PORT First backend port (default: 8001)
--backend2-port PORT Second backend port (default: 8002)
-h, --help Show this help

Examples:
$0 # Basic test (2000 concurrent, 10000 requests)
$0 -c 500 -n 20000 # High load test
$0 -p 8080 -c 100 # Different port, lower load
$0 --backend1-port 9000 --backend2-port 9001 # Custom backend ports

Prerequisites:
- Router must be started with VLLM_ROUTER_STRESS_TEST_MODE=true
EOF
}
153+
154+
# Verify that the Apache Bench binary (ab) can be found.
# Returns normally when it is available; otherwise reports how to install it
# and terminates the script with exit status 1.
check_ab() {
    # Nothing to do when ab resolves to a runnable command
    command -v ab >/dev/null 2>&1 && return 0

    print_error "Apache Bench (ab) not found!"
    print_error "Install with: sudo apt-get install apache2-utils"
    exit 1
}
162+
163+
# Stop the router that this script started (installed as the EXIT trap).
#
# Only the process recorded in ROUTER_PID is signalled. Matching on the
# command line (pkill -f) would also take down unrelated router instances
# running on the same machine, and would do so even when this script exited
# before it ever launched a router.
#
# Globals: ROUTER_PID (read; may be unset when no router was started)
cleanup() {
    print_status "Cleaning up router processes..."

    # ${...:-} keeps this safe under `set -u` when start_router never ran
    local pid="${ROUTER_PID:-}"
    if [[ -z "$pid" ]]; then
        return 0
    fi

    # Ask politely first and give the router a short grace period
    kill "$pid" 2>/dev/null || true
    sleep 2

    # Still around after the grace period: force it
    if kill -0 "$pid" 2>/dev/null; then
        kill -KILL "$pid" 2>/dev/null || true
    fi

    # Reap the child; failures (already reaped / not our child) are harmless
    wait "$pid" 2>/dev/null || true
}
169+
170+
# Launch the router in the background and wait until it answers HTTP requests.
#
# The router is started with static service discovery over BACKENDS_URL,
# round-robin routing, and VLLM_ROUTER_STRESS_TEST_MODE=true exported into its
# environment. All router output is redirected to $LOG_DIR/router.log.
#
# Readiness is polled for up to 30 seconds on /v1/models. The wait ends early
# when the router process has exited, so a router that crashes on startup is
# reported immediately instead of after the full timeout.
#
# Globals: LOG_DIR, ROUTER_PORT, BACKENDS_URL, MODEL (read)
#          ROUTER_PID, VLLM_ROUTER_STRESS_TEST_MODE (written)
# Exits:   with status 1 when the router does not become ready
start_router() {
    local log_file="$LOG_DIR/router.log"

    print_status "Starting router with round-robin routing (stress test mode)"

    # Create log directory
    mkdir -p "$(dirname "$log_file")"

    # Set stress test mode
    export VLLM_ROUTER_STRESS_TEST_MODE=true

    # Start router with detailed logging
    python3 -m src.vllm_router.app --port "$ROUTER_PORT" \
        --service-discovery static \
        --static-backends "$BACKENDS_URL" \
        --static-models "$MODEL,$MODEL" \
        --static-model-types "chat,chat" \
        --routing-logic roundrobin \
        --log-stats \
        --log-stats-interval 5 > "$log_file" 2>&1 &

    ROUTER_PID=$!
    print_status "Router started with PID: $ROUTER_PID"

    # Wait for router to be ready
    print_status "Waiting for router to be ready..."
    local deadline=$((SECONDS + 30))
    local ready=false
    local failure="Router failed to start within 30 seconds"
    while (( SECONDS < deadline )); do
        # --max-time keeps a single hung connection from eating the whole budget
        if curl -s --max-time 5 "http://localhost:$ROUTER_PORT/v1/models" > /dev/null 2>&1; then
            ready=true
            break
        fi
        # No point in polling a router that is no longer running
        if ! kill -0 "$ROUTER_PID" 2>/dev/null; then
            failure="Router process exited before becoming ready"
            break
        fi
        sleep 1
    done

    if [[ "$ready" != true ]]; then
        print_error "$failure"
        print_error "Router log:"
        tail -20 "$log_file" || true
        exit 1
    fi
    print_status "Router is ready"
}
205+
206+
# Fire the configured load at the router's chat completions endpoint using
# Apache Bench.
#
# The JSON request body is written to a private temporary file created with
# mktemp (a fixed name in /tmp is predictable and collides between concurrent
# runs). The file is removed whether or not ab succeeds.
#
# Globals: CONCURRENT, REQUESTS, MODEL, ROUTER_PORT (read)
# Returns: 0 on success, ab's exit status when ab fails, 1 when the
#          temporary file cannot be created
run_stress_test() {
    print_status "Running stress test with Apache Bench"
    print_status "Concurrent: $CONCURRENT, Total: $REQUESTS"

    # Create payload file
    local payload_file
    payload_file=$(mktemp "${TMPDIR:-/tmp}/stress_payload.XXXXXX") || {
        print_error "Failed to create temporary payload file"
        return 1
    }

    cat > "$payload_file" << EOF
{
  "model": "$MODEL",
  "messages": [
    {"role": "user", "content": "Test message for stress testing"}
  ],
  "max_tokens": 10,
  "temperature": 0.7
}
EOF

    # Run Apache Bench; capture the status so the payload is always removed
    local ab_status=0
    ab -c "$CONCURRENT" \
        -n "$REQUESTS" \
        -p "$payload_file" \
        -T "application/json" \
        -H "Authorization: Bearer test" \
        -H "x-user-id: stress-test-user" \
        "http://localhost:$ROUTER_PORT/v1/chat/completions" || ab_status=$?

    # Clean up payload file
    rm -f -- "$payload_file"

    if (( ab_status != 0 )); then
        print_error "Apache Bench exited with status $ab_status"
        return "$ab_status"
    fi

    print_status "Stress test completed"

    # Small delay to ensure all logs are written
    sleep 2
}
241+
242+
# Count the router log lines that record a request being routed to the
# backend on the given port and print the count.
# Arguments: $1 - backend port, $2 - router log file
_count_routed_to_port() {
    local port="$1"
    local log_file="$2"
    local count

    # grep -c prints "0" AND exits 1 when nothing matches, so the fallback
    # must not print a second "0" (that would produce "0<newline>0" and break
    # the arithmetic below). The trailing ([^0-9]|$) stops port 8001 from
    # also matching e.g. 80010.
    count=$(grep -c -E -- "to http://localhost:${port}([^0-9]|\$)" "$log_file" || true)

    # Empty output means grep could not read the file; treat it as zero
    printf '%s\n' "${count:-0}"
}

# Check that the router spread the requests evenly over both backends.
#
# Routing decisions are taken from $LOG_DIR/router.log by counting the lines
# containing "to http://localhost:<port>" for each backend port. The split is
# accepted when the two percentage shares differ by at most 20 points.
#
# Globals: LOG_DIR, BACKEND1_PORT, BACKEND2_PORT (read)
# Returns: 0 when the distribution is within tolerance; 1 when the log file
#          is missing, no routing decisions were logged, or the split is
#          uneven
check_roundrobin_correctness() {
    local log_file="$LOG_DIR/router.log"

    print_status "Checking round-robin routing correctness..."

    if [ ! -f "$log_file" ]; then
        print_error "Router log file not found: $log_file"
        return 1
    fi

    # Extract backend routing decisions from logs
    # Look for "Routing request ... to http://localhost:XXXX"
    local backend1_count backend2_count
    backend1_count=$(_count_routed_to_port "$BACKEND1_PORT" "$log_file")
    backend2_count=$(_count_routed_to_port "$BACKEND2_PORT" "$log_file")
    local total_routed=$((backend1_count + backend2_count))

    print_status "Round-robin routing results:"
    print_status " Backend localhost:$BACKEND1_PORT: $backend1_count requests"
    print_status " Backend localhost:$BACKEND2_PORT: $backend2_count requests"
    print_status " Total routed: $total_routed requests"

    # Guard the divisions below against an empty log
    if [ "$total_routed" -eq 0 ]; then
        print_error "No routing decisions found in logs"
        return 1
    fi

    # Calculate percentages (integer division, rounds down)
    local backend1_pct=$((backend1_count * 100 / total_routed))
    local backend2_pct=$((backend2_count * 100 / total_routed))

    print_status " Backend localhost:$BACKEND1_PORT: ${backend1_pct}%"
    print_status " Backend localhost:$BACKEND2_PORT: ${backend2_pct}%"

    # Check if distribution is roughly even (within 20% tolerance)
    local diff=$((backend1_pct > backend2_pct ? backend1_pct - backend2_pct : backend2_pct - backend1_pct))

    if [ "$diff" -le 20 ]; then
        print_status "✅ Round-robin routing is working correctly (${diff}% difference)"
        return 0
    else
        print_error "❌ Round-robin routing appears uneven (${diff}% difference)"
        print_status "Last 10 routing decisions from logs:"
        # Best effort: the log may hold no "Routing request" lines at all
        grep "Routing request.*to http://localhost:" "$log_file" | tail -10 | sed 's/^/ /' || true
        return 1
    fi
}
289+
290+
# Print the tail of the router log, each line prefixed with a space.
# Does nothing when $LOG_DIR/router.log does not exist.
# Globals: LOG_DIR (read)
show_log_summary() {
    local router_log="$LOG_DIR/router.log"

    # Nothing to summarise without a log file
    [ -f "$router_log" ] || return 0

    print_status "Log summary (last 20 lines):"
    tail -20 "$router_log" | sed 's/^/ /'
}
299+
300+
# Abort with the usage text when an option that takes a value is the last
# word on the command line. Without this check "$2" would be unset and the
# script would die with a bare "unbound variable" error under `set -u`.
# Arguments: $1 - option name, $2 - number of words left (option included)
_require_option_value() {
    if [[ "$2" -lt 2 ]]; then
        print_error "Option $1 requires a value"
        show_usage
        exit 1
    fi
}

# Abort with the usage text unless the value is a positive integer.
# Arguments: $1 - option name, $2 - value supplied for it
_require_positive_integer() {
    if [[ ! "$2" =~ ^[1-9][0-9]*$ ]]; then
        print_error "Option $1 expects a positive integer, got: $2"
        show_usage
        exit 1
    fi
}

# Parse command line arguments
#
# Stores each recognised option in its configuration global, prints the help
# and exits 0 for -h/--help, and prints the help and exits 1 for an unknown
# option, a missing value, or a non-numeric count/port.
#
# Globals: CONCURRENT, REQUESTS, ROUTER_PORT, LOG_DIR, MODEL,
#          BACKEND1_PORT, BACKEND2_PORT (written)
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            -c|--concurrent)
                _require_option_value "$1" "$#"
                _require_positive_integer "$1" "$2"
                CONCURRENT="$2"
                shift 2
                ;;
            -n|--requests)
                _require_option_value "$1" "$#"
                _require_positive_integer "$1" "$2"
                REQUESTS="$2"
                shift 2
                ;;
            -p|--port)
                _require_option_value "$1" "$#"
                _require_positive_integer "$1" "$2"
                ROUTER_PORT="$2"
                shift 2
                ;;
            -l|--log-dir)
                _require_option_value "$1" "$#"
                LOG_DIR="$2"
                shift 2
                ;;
            -m|--model)
                _require_option_value "$1" "$#"
                MODEL="$2"
                shift 2
                ;;
            --backend1-port)
                _require_option_value "$1" "$#"
                _require_positive_integer "$1" "$2"
                BACKEND1_PORT="$2"
                shift 2
                ;;
            --backend2-port)
                _require_option_value "$1" "$#"
                _require_positive_integer "$1" "$2"
                BACKEND2_PORT="$2"
                shift 2
                ;;
            -h|--help)
                show_usage
                exit 0
                ;;
            *)
                print_error "Unknown option: $1"
                show_usage
                exit 1
                ;;
        esac
    done
}

parse_args "$@"
342+
343+
# Run cleanup whenever the script exits from this point on
trap cleanup EXIT

# Rebuild the backend list so it reflects any port overrides parsed above
BACKENDS_URL="http://localhost:${BACKEND1_PORT},http://localhost:${BACKEND2_PORT}"

# Prerequisites
print_status "Checking prerequisites..."
check_ab

# Echo the effective configuration before generating any load
print_status "Router stress test configuration:"
for config_line in \
    " Concurrent requests: $CONCURRENT" \
    " Total requests: $REQUESTS" \
    " Router port: $ROUTER_PORT" \
    " Backend ports: $BACKEND1_PORT, $BACKEND2_PORT" \
    " Model: $MODEL"; do
    print_status "$config_line"
done

# Bring the router up, then drive the load through it
start_router
run_stress_test

# Verdict: an uneven distribution fails the run and dumps the log tail
if ! check_roundrobin_correctness; then
    print_error "Test completed but round-robin routing correctness check failed!"
    show_log_summary
    exit 1
fi
print_status "Test completed successfully!"

0 commit comments

Comments
 (0)