Skip to content

Commit db38fc4

Browse files
author
BrianPark314
committed
Merge branch 'main' into feature/prefix-aware-routing
# Conflicts:
#	src/gateway_inference_extension/configs/vllm/vllm-runtime.yaml
2 parents bca73e5 + 40b9902 commit db38fc4

File tree

66 files changed

+4835
-1373
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

66 files changed

+4835
-1373
lines changed

.github/template-chatml.jinja

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
2+
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}

.github/values-06-session-routing.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ servingEngineSpec:
1010

1111
replicaCount: 2
1212

13-
requestCPU: 6
13+
requestCPU: 4
1414
requestMemory: "16Gi"
1515
requestGPU: 0.5
1616

.github/values-07-prefix-routing.yaml

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
servingEngineSpec:
2+
strategy:
3+
type: Recreate
4+
runtimeClassName: ""
5+
modelSpec:
6+
- name: "opt125m"
7+
repository: "vllm/vllm-openai"
8+
tag: "latest"
9+
modelURL: "facebook/opt-125m"
10+
11+
replicaCount: 2
12+
13+
requestCPU: 4
14+
requestMemory: "16Gi"
15+
requestGPU: 1
16+
17+
pvcStorage: "10Gi"
18+
pvcAccessMode:
19+
- ReadWriteMany
20+
21+
vllmConfig:
22+
maxModelLen: 1024
23+
extraArgs: ["--disable-log-requests", "--gpu-memory-utilization", "0.8"]
24+
chatTemplate: "chat.jinja2"
25+
chatTemplateConfigMap: |-
26+
{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
27+
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
28+
29+
routerSpec:
30+
repository: "localhost:5000/git-act-router"
31+
imagePullPolicy: "IfNotPresent"
32+
enableRouter: true
33+
routingLogic: "prefixaware"
34+
extraArgs:
35+
- "--log-level"
36+
- "info"
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
servingEngineSpec:
2+
strategy:
3+
type: Recreate
4+
runtimeClassName: ""
5+
modelSpec:
6+
- name: "opt125m"
7+
repository: "vllm/vllm-openai"
8+
tag: "latest"
9+
modelURL: "facebook/opt-125m"
10+
11+
replicaCount: 2
12+
13+
requestCPU: 4
14+
requestMemory: "16Gi"
15+
requestGPU: 1
16+
17+
pvcStorage: "10Gi"
18+
pvcAccessMode:
19+
- ReadWriteMany
20+
21+
vllmConfig:
22+
maxModelLen: 1024
23+
extraArgs: ["--disable-log-requests", "--gpu-memory-utilization", "0.8"]
24+
chatTemplate: "chat.jinja2"
25+
chatTemplateConfigMap: |-
26+
{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
27+
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
28+
29+
routerSpec:
30+
repository: "localhost:5000/git-act-router"
31+
imagePullPolicy: "IfNotPresent"
32+
enableRouter: true
33+
routingLogic: "roundrobin"
34+
extraArgs:
35+
- "--log-level"
36+
- "info"
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
servingEngineSpec:
2+
strategy:
3+
type: Recreate
4+
runtimeClassName: ""
5+
modelSpec:
6+
- name: "opt125m"
7+
repository: "vllm/vllm-openai"
8+
tag: "latest"
9+
modelURL: "facebook/opt-125m"
10+
11+
replicaCount: 2
12+
13+
requestCPU: 4
14+
requestMemory: "16Gi"
15+
requestGPU: 1
16+
17+
pvcStorage: "10Gi"
18+
pvcAccessMode:
19+
- ReadWriteMany
20+
21+
vllmConfig:
22+
maxModelLen: 1024
23+
extraArgs: ["--disable-log-requests", "--gpu-memory-utilization", "0.8"]
24+
chatTemplate: "chat.jinja2"
25+
chatTemplateConfigMap: |-
26+
{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
27+
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
28+
29+
routerSpec:
30+
repository: "localhost:5000/git-act-router"
31+
imagePullPolicy: "IfNotPresent"
32+
enableRouter: true
33+
routingLogic: "kvaware"
34+
extraArgs:
35+
- "--log-level"
36+
- "info"

.github/workflows/pre-commit.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ jobs:
1515
python-version: "3.12"
1616
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
1717
- uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
18+
- run: pip install ruff
19+
- run: ruff check src/tests
1820

1921
pre-commit-manual:
2022
strategy:

.github/workflows/router-e2e-test.yml

Lines changed: 74 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ jobs:
101101
~/.kube/config
102102
src/tests/perftest/logs
103103
104-
sticky-routing-e2e-test:
104+
k8s-discovery-e2e-test:
105105
runs-on: self-hosted
106106
needs: e2e-test
107107
if: github.event.pull_request.draft == false
@@ -141,65 +141,92 @@ jobs:
141141
sudo docker push localhost:5000/git-act-router
142142
minikube image load localhost:5000/git-act-router
143143
144-
- name: Deploy two-pods setup via helm charts
144+
- name: Run all k8s discovery routing tests
145145
run: |
146-
echo "🚀 Deploying two-pods setup with helm"
147-
cd ${{ github.workspace }}
148-
helm install vllm ./helm -f .github/values-06-session-routing.yaml
146+
echo "🧪 Running all k8s discovery routing tests"
147+
./tests/e2e/run-k8s-routing-test.sh all \
148+
--model "facebook/opt-125m" \
149+
--num-requests 25 \
150+
--chunk-size 128 \
151+
--verbose \
152+
--result-dir /tmp/k8s-discovery-routing-results-pr-${{ github.event.pull_request.number || 'main' }} \
153+
--timeout 10
154+
timeout-minutes: 10
155+
156+
- name: Archive k8s discovery routing test results
157+
uses: actions/upload-artifact@v4
158+
if: always()
159+
with:
160+
name: k8s-discovery-routing-test-results-pr-${{ github.event.pull_request.number || 'main' }}
161+
path: |
162+
/tmp/k8s-discovery-routing-results-pr-${{ github.event.pull_request.number || 'main' }}/*
163+
164+
- run: echo "🍏 K8s discovery e2e test job status is ${{ job.status }}."
165+
166+
static-discovery-e2e-test:
167+
runs-on: self-hosted
168+
needs: e2e-test
169+
if: github.event.pull_request.draft == false
170+
env:
171+
LOG_DIR: /tmp/static-discovery-e2e-test-${{ github.event.pull_request.number || 'main' }}
149172

150-
- name: Wait for pods to be ready
173+
steps:
174+
- name: Check out repository code
175+
uses: actions/checkout@v4
176+
177+
- name: Setup Python
178+
uses: actions/setup-python@v5
179+
with:
180+
python-version: "3.12"
181+
182+
- name: Install Python dependencies
151183
run: |
152-
echo "⏳ Making wait-for-pods script executable and running it"
153-
chmod +x .github/wait-for-pods.sh
154-
./.github/wait-for-pods.sh --pod-prefix vllm --timeout 300 --verbose
184+
python -m pip install --upgrade pip
185+
pip install -e .
155186
156-
- name: Make test script executable
187+
- name: Install vLLM
157188
run: |
158-
chmod +x tests/e2e/test-sticky-routing.sh
189+
pip install vllm
159190
160-
- name: Run sticky routing e2e test
191+
- name: Start 2 vLLM serve backends
161192
run: |
162-
echo "🧪 Running sticky routing test"
163-
cd ${{ github.workspace }}
164-
# Set the model to match what's deployed in the helm values
165-
# Enable debug mode to preserve temp files for artifact collection
166-
./tests/e2e/test-sticky-routing.sh --model "facebook/opt-125m" --num-rounds 3 --verbose --debug
167-
timeout-minutes: 10
193+
echo "🚀 Starting vLLM serve backend"
194+
mkdir -p "$LOG_DIR"
195+
CUDA_VISIBLE_DEVICES=0 vllm serve facebook/opt-125m --port 8001 --gpu-memory-utilization 0.7 --chat-template .github/template-chatml.jinja > "$LOG_DIR/backend1.log" 2>&1 &
196+
CUDA_VISIBLE_DEVICES=1 vllm serve facebook/opt-125m --port 8002 --gpu-memory-utilization 0.7 --chat-template .github/template-chatml.jinja > "$LOG_DIR/backend2.log" 2>&1 &
168197
169-
- name: Archive sticky routing test results
198+
- name: Wait for backends to be ready
199+
run: |
200+
echo "⏳ Waiting for backends to be ready"
201+
chmod +x tests/e2e/wait-for-backends.sh
202+
./tests/e2e/wait-for-backends.sh 180 "http://localhost:8001" "http://localhost:8002"
203+
204+
- name: Run All Static Discovery Routing Tests
205+
env:
206+
PYTHONPATH: ${{ github.workspace }}/src
207+
run: |
208+
echo "🧪 Running all static discovery routing tests sequentially"
209+
chmod +x tests/e2e/run-static-discovery-routing-test.sh
210+
./tests/e2e/run-static-discovery-routing-test.sh all \
211+
--pythonpath "$PYTHONPATH" \
212+
--log-dir "$LOG_DIR" \
213+
--num-requests 20 \
214+
--verbose
215+
timeout-minutes: 5
216+
217+
- name: Archive static discovery test results and logs
170218
uses: actions/upload-artifact@v4
171219
if: always()
172220
with:
173-
name: sticky-routing-test-results-pr-${{ github.event.pull_request.number || 'main' }}
221+
name: static-discovery-test-results-pr-${{ github.event.pull_request.number || 'main' }}
174222
path: |
175-
/tmp/sticky-routing-results-*
223+
${{ env.LOG_DIR }}/*
176224
177-
- name: Get router and pod logs for debugging
178-
if: always()
179-
run: |
180-
echo "📋 Collecting logs for debugging"
181-
mkdir -p debug-logs
182-
# Get router logs
183-
kubectl logs -l app.kubernetes.io/component=router --tail=100 > debug-logs/router.log || true
184-
# Get serving engine logs
185-
kubectl logs -l app.kubernetes.io/component=serving-engine --tail=100 > debug-logs/serving-engines.log || true
186-
# Get pod status
187-
kubectl get pods -o wide > debug-logs/pod-status.txt || true
188-
# Get services
189-
kubectl get svc > debug-logs/services.txt || true
190-
191-
- name: Upload debug logs
192-
uses: actions/upload-artifact@v4
225+
- name: Cleanup processes
193226
if: always()
194-
with:
195-
name: debug-logs-pr-${{ github.event.pull_request.number || 'main' }}
196-
path: debug-logs/
197-
198-
- name: Helm uninstall and cleanup
199227
run: |
200-
echo "🧹 Cleaning up resources"
201-
helm uninstall vllm || true
202-
sudo docker image prune -f || true
203-
if: always()
228+
echo "🧹 Cleaning up processes"
229+
pkill -f "vllm serve" || true
230+
pkill -f "python3 -m src.vllm_router.app" || true
204231
205-
- run: echo "🍏 Sticky routing e2e test job status is ${{ job.status }}."
232+
- run: echo "🍏 Static discovery e2e test job status is ${{ job.status }}."

0 commit comments

Comments
 (0)