
Commit d6ae547

chore: setup for performance benchmarking (#3096)
# What does this PR do?

1. Added a simple mock OpenAI-compat server that serves chat completions.
2. Added a benchmark server deployment in EKS that includes the mock inference server.
3. Added a Locust (https://locust.io/) locustfile for load testing.

## Test Plan

```bash
bash apply.sh
kubectl port-forward service/locust-web-ui 8089:8089
```

Go to localhost:8089 to start a load test.

<img width="1392" height="334" alt="Locust web UI" src="https://github.com/user-attachments/assets/d6aa3deb-583a-42ed-889b-751262b8e91c" />
<img width="1362" height="881" alt="Locust web UI" src="https://github.com/user-attachments/assets/6a28b9b4-05e6-44e2-b504-07e60c12d35e" />
1 parent 2f51273 commit d6ae547

File tree: 11 files changed, +1234 −3 lines

apply.sh (new file) — 57 additions, 0 deletions
```bash
#!/usr/bin/env bash

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

# Deploys the benchmark-specific components on top of the base k8s deployment (../k8s/apply.sh).

export MOCK_INFERENCE_PORT=8080
export STREAM_DELAY_SECONDS=0.005

export POSTGRES_USER=llamastack
export POSTGRES_DB=llamastack
export POSTGRES_PASSWORD=llamastack

export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B

export MOCK_INFERENCE_MODEL=mock-inference

# Use llama-stack-benchmark-service as the benchmark server
export LOCUST_HOST=http://llama-stack-benchmark-service:8323
export LOCUST_BASE_PATH=/v1/openai/v1

# Use vllm-service as the benchmark server
# export LOCUST_HOST=http://vllm-server:8000
# export LOCUST_BASE_PATH=/v1

export BENCHMARK_INFERENCE_MODEL=$INFERENCE_MODEL

set -euo pipefail
set -x

# Deploy benchmark-specific components
# Deploy OpenAI mock server
kubectl create configmap openai-mock --from-file=openai-mock-server.py \
  --dry-run=client -o yaml | kubectl apply --validate=false -f -

envsubst < openai-mock-deployment.yaml | kubectl apply --validate=false -f -

# Create configmap with our custom stack config
kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
  --dry-run=client -o yaml > stack-configmap.yaml

kubectl apply --validate=false -f stack-configmap.yaml

# Deploy our custom llama stack server (overriding the base one)
envsubst < stack-k8s.yaml.template | kubectl apply --validate=false -f -

# Deploy Locust load testing
kubectl create configmap locust-script --from-file=locustfile.py \
  --dry-run=client -o yaml | kubectl apply --validate=false -f -

envsubst < locust-k8s.yaml | kubectl apply --validate=false -f -
```
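Once apply.sh finishes, you can sanity-check the benchmark endpoint before kicking off a full Locust run. A minimal sketch, not part of this commit, assuming `kubectl port-forward service/llama-stack-benchmark-service 8323:8323` is running in another terminal (the service name, port, and base path come from `LOCUST_HOST` and `LOCUST_BASE_PATH` above):

```python
# Hypothetical smoke test (not in this commit): send one streaming chat
# completion through the port-forwarded benchmark service and print chunks.
import requests

resp = requests.post(
    "http://localhost:8323/v1/openai/v1/chat/completions",
    json={
        "model": "meta-llama/Llama-3.2-3B-Instruct",
        "messages": [{"role": "user", "content": "Hi"}],
        "stream": True,
        "max_tokens": 16,
    },
    stream=True,
    timeout=30,
)
resp.raise_for_status()
for line in resp.iter_lines():
    if line.startswith(b"data: "):
        print(line.decode("utf-8"))  # one SSE line per streamed chunk
```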
locust-k8s.yaml (new file) — 131 additions, 0 deletions
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: locust-master
  labels:
    app: locust
    role: master
spec:
  replicas: 1
  selector:
    matchLabels:
      app: locust
      role: master
  template:
    metadata:
      labels:
        app: locust
        role: master
    spec:
      containers:
      - name: locust-master
        image: locustio/locust:2.31.8
        ports:
        - containerPort: 8089  # Web UI
        - containerPort: 5557  # Master communication
        env:
        - name: LOCUST_HOST
          value: "${LOCUST_HOST}"
        - name: LOCUST_LOCUSTFILE
          value: "/locust/locustfile.py"
        - name: LOCUST_WEB_HOST
          value: "0.0.0.0"
        - name: LOCUST_MASTER
          value: "true"
        - name: LOCUST_BASE_PATH
          value: "${LOCUST_BASE_PATH}"
        - name: INFERENCE_MODEL
          value: "${BENCHMARK_INFERENCE_MODEL}"
        volumeMounts:
        - name: locust-script
          mountPath: /locust
        command: ["locust"]
        args:
          - "--master"
          - "--web-host=0.0.0.0"
          - "--web-port=8089"
          - "--host=${LOCUST_HOST}"
          - "--locustfile=/locust/locustfile.py"
      volumes:
      - name: locust-script
        configMap:
          name: locust-script
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: locust-worker
  labels:
    app: locust
    role: worker
spec:
  replicas: 2  # Start with 2 workers, can be scaled up
  selector:
    matchLabels:
      app: locust
      role: worker
  template:
    metadata:
      labels:
        app: locust
        role: worker
    spec:
      containers:
      - name: locust-worker
        image: locustio/locust:2.31.8
        env:
        - name: LOCUST_HOST
          value: "${LOCUST_HOST}"
        - name: LOCUST_LOCUSTFILE
          value: "/locust/locustfile.py"
        - name: LOCUST_MASTER_HOST
          value: "locust-master-service"
        - name: LOCUST_MASTER_PORT
          value: "5557"
        - name: INFERENCE_MODEL
          value: "${BENCHMARK_INFERENCE_MODEL}"
        - name: LOCUST_BASE_PATH
          value: "${LOCUST_BASE_PATH}"
        volumeMounts:
        - name: locust-script
          mountPath: /locust
        command: ["locust"]
        args:
          - "--worker"
          - "--master-host=locust-master-service"
          - "--master-port=5557"
          - "--locustfile=/locust/locustfile.py"
      volumes:
      - name: locust-script
        configMap:
          name: locust-script
---
apiVersion: v1
kind: Service
metadata:
  name: locust-master-service
spec:
  selector:
    app: locust
    role: master
  ports:
  - name: web-ui
    port: 8089
    targetPort: 8089
  - name: master-comm
    port: 5557
    targetPort: 5557
  type: ClusterIP
---
apiVersion: v1
kind: Service
metadata:
  name: locust-web-ui
spec:
  selector:
    app: locust
    role: master
  ports:
  - port: 8089
    targetPort: 8089
  type: ClusterIP  # Keep internal, use port-forward to access
```
locustfile.py (new file) — 78 additions, 0 deletions
```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

"""
Locust load testing script for Llama Stack with the mock OpenAI-compat provider.
"""

import os
import random

from locust import HttpUser, between, task

base_path = os.getenv("LOCUST_BASE_PATH", "/v1/openai/v1")

MODEL_ID = os.getenv("INFERENCE_MODEL")


class LlamaStackUser(HttpUser):
    wait_time = between(0.0, 0.0001)

    def on_start(self):
        """Set up headers and test data."""
        # No auth required for benchmark server
        self.headers = {
            "Content-Type": "application/json"
        }

        # Test messages of varying lengths
        self.test_messages = [
            [{"role": "user", "content": "Hi"}],
            [{"role": "user", "content": "What is the capital of France?"}],
            [{"role": "user", "content": "Explain quantum physics in simple terms."}],
            [{"role": "user", "content": "Write a short story about a robot learning to paint."}],
            [
                {"role": "user", "content": "What is machine learning?"},
                {"role": "assistant", "content": "Machine learning is a subset of AI..."},
                {"role": "user", "content": "Can you give me a practical example?"}
            ]
        ]

    @task(weight=100)
    def chat_completion_streaming(self):
        """Streaming chat completion (the only task, so it accounts for all requests)."""
        messages = random.choice(self.test_messages)
        payload = {
            "model": MODEL_ID,
            "messages": messages,
            "stream": True,
            "max_tokens": 100
        }

        with self.client.post(
            f"{base_path}/chat/completions",
            headers=self.headers,
            json=payload,
            stream=True,
            catch_response=True
        ) as response:
            if response.status_code == 200:
                chunks_received = 0
                try:
                    for line in response.iter_lines():
                        if line:
                            line_str = line.decode("utf-8")
                            if line_str.startswith("data: "):
                                chunks_received += 1
                                if line_str.strip() == "data: [DONE]":
                                    break

                    if chunks_received > 0:
                        response.success()
                    else:
                        response.failure("No streaming chunks received")
                except Exception as e:
                    response.failure(f"Streaming error: {e}")
            else:
                response.failure(f"HTTP {response.status_code}: {response.text}")
```
openai-mock-deployment.yaml (new file) — 52 additions, 0 deletions
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: openai-mock
  labels:
    app: openai-mock
spec:
  replicas: 1
  selector:
    matchLabels:
      app: openai-mock
  template:
    metadata:
      labels:
        app: openai-mock
    spec:
      containers:
      - name: openai-mock
        image: python:3.12-slim
        ports:
        - containerPort: ${MOCK_INFERENCE_PORT}
        env:
        - name: PORT
          value: "${MOCK_INFERENCE_PORT}"
        - name: MOCK_MODELS
          value: "${MOCK_INFERENCE_MODEL}"
        - name: STREAM_DELAY_SECONDS
          value: "${STREAM_DELAY_SECONDS}"
        command: ["sh", "-c"]
        args:
        - |
          pip install flask &&
          python /app/openai-mock-server.py --port ${MOCK_INFERENCE_PORT}
        volumeMounts:
        - name: openai-mock-script
          mountPath: /app
      volumes:
      - name: openai-mock-script
        configMap:
          name: openai-mock
---
apiVersion: v1
kind: Service
metadata:
  name: openai-mock-service
spec:
  selector:
    app: openai-mock
  ports:
  - port: 8080        # hardcoded; must match MOCK_INFERENCE_PORT exported in apply.sh
    targetPort: 8080
  type: ClusterIP
```
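The `openai-mock-server.py` mounted from the `openai-mock` ConfigMap is part of this commit but not shown in this diff view. For orientation, here is a hypothetical sketch of what such a Flask-based OpenAI-compat mock could look like (not the actual file; it assumes the `PORT`, `MOCK_MODELS`, and `STREAM_DELAY_SECONDS` environment variables set in the Deployment above):

```python
# Hypothetical stand-in for openai-mock-server.py (the real file is in the
# commit but not shown here). Serves a fixed streamed reply with a
# configurable per-chunk delay so load tests exercise SSE handling.
import json
import os
import time

from flask import Flask, Response, request

app = Flask(__name__)
DELAY = float(os.getenv("STREAM_DELAY_SECONDS", "0.005"))
MODELS = os.getenv("MOCK_MODELS", "mock-inference").split(",")


@app.get("/v1/models")
def models():
    return {"object": "list", "data": [{"id": m, "object": "model"} for m in MODELS]}


@app.post("/v1/chat/completions")
def chat_completions():
    body = request.get_json(force=True)
    if not body.get("stream"):
        return {
            "object": "chat.completion",
            "model": body.get("model"),
            "choices": [{"index": 0, "message": {"role": "assistant", "content": "mock reply"}}],
        }

    def sse():
        for token in ["mock", " streamed", " reply"]:
            chunk = {
                "object": "chat.completion.chunk",
                "choices": [{"index": 0, "delta": {"content": token}}],
            }
            yield f"data: {json.dumps(chunk)}\n\n"
            time.sleep(DELAY)  # simulate per-token generation latency
        yield "data: [DONE]\n\n"

    return Response(sse(), mimetype="text/event-stream")


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=int(os.getenv("PORT", "8080")))
```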
