
Commit 40b9902

Author: BrianPark314
Merge remote-tracking branch 'origin/main'
2 parents 2d8c586 + 49956c1

File tree: 15 files changed, +774 additions, −99 deletions

.github/values-06-session-routing.yaml

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ servingEngineSpec:

     replicaCount: 2

-    requestCPU: 6
+    requestCPU: 4
     requestMemory: "16Gi"
     requestGPU: 0.5
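The only change here is the lower CPU request; to spot-check that it took effect on the serving pods after redeploying, something like the following should work (a hypothetical check, reusing the app.kubernetes.io/component=serving-engine label that appears in this repo's CI steps):

kubectl get pods -l app.kubernetes.io/component=serving-engine \
  -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.containers[0].resources.requests.cpu}{"\n"}{end}'

Each replica should now report a CPU request of 4.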

.github/values-07-prefix-routing.yaml

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+servingEngineSpec:
+  strategy:
+    type: Recreate
+  runtimeClassName: ""
+  modelSpec:
+  - name: "opt125m"
+    repository: "vllm/vllm-openai"
+    tag: "latest"
+    modelURL: "facebook/opt-125m"
+
+    replicaCount: 2
+
+    requestCPU: 4
+    requestMemory: "16Gi"
+    requestGPU: 1
+
+    pvcStorage: "10Gi"
+    pvcAccessMode:
+      - ReadWriteMany
+
+    vllmConfig:
+      maxModelLen: 1024
+      extraArgs: ["--disable-log-requests", "--gpu-memory-utilization", "0.8"]
+    chatTemplate: "chat.jinja2"
+    chatTemplateConfigMap: |-
+      {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
+      {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
+
+routerSpec:
+  repository: "localhost:5000/git-act-router"
+  imagePullPolicy: "IfNotPresent"
+  enableRouter: true
+  routingLogic: "prefixaware"
+  extraArgs:
+    - "--log-level"
+    - "info"
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
servingEngineSpec:
2+
strategy:
3+
type: Recreate
4+
runtimeClassName: ""
5+
modelSpec:
6+
- name: "opt125m"
7+
repository: "vllm/vllm-openai"
8+
tag: "latest"
9+
modelURL: "facebook/opt-125m"
10+
11+
replicaCount: 2
12+
13+
requestCPU: 4
14+
requestMemory: "16Gi"
15+
requestGPU: 1
16+
17+
pvcStorage: "10Gi"
18+
pvcAccessMode:
19+
- ReadWriteMany
20+
21+
vllmConfig:
22+
maxModelLen: 1024
23+
extraArgs: ["--disable-log-requests", "--gpu-memory-utilization", "0.8"]
24+
chatTemplate: "chat.jinja2"
25+
chatTemplateConfigMap: |-
26+
{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
27+
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
28+
29+
routerSpec:
30+
repository: "localhost:5000/git-act-router"
31+
imagePullPolicy: "IfNotPresent"
32+
enableRouter: true
33+
routingLogic: "roundrobin"
34+
extraArgs:
35+
- "--log-level"
36+
- "info"
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
servingEngineSpec:
2+
strategy:
3+
type: Recreate
4+
runtimeClassName: ""
5+
modelSpec:
6+
- name: "opt125m"
7+
repository: "vllm/vllm-openai"
8+
tag: "latest"
9+
modelURL: "facebook/opt-125m"
10+
11+
replicaCount: 2
12+
13+
requestCPU: 4
14+
requestMemory: "16Gi"
15+
requestGPU: 1
16+
17+
pvcStorage: "10Gi"
18+
pvcAccessMode:
19+
- ReadWriteMany
20+
21+
vllmConfig:
22+
maxModelLen: 1024
23+
extraArgs: ["--disable-log-requests", "--gpu-memory-utilization", "0.8"]
24+
chatTemplate: "chat.jinja2"
25+
chatTemplateConfigMap: |-
26+
{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
27+
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
28+
29+
routerSpec:
30+
repository: "localhost:5000/git-act-router"
31+
imagePullPolicy: "IfNotPresent"
32+
enableRouter: true
33+
routingLogic: "kvaware"
34+
extraArgs:
35+
- "--log-level"
36+
- "info"

.github/workflows/router-e2e-test.yml

Lines changed: 14 additions & 55 deletions
@@ -101,7 +101,7 @@ jobs:
           ~/.kube/config
           src/tests/perftest/logs

-  sticky-routing-e2e-test:
+  k8s-discovery-e2e-test:
     runs-on: self-hosted
     needs: e2e-test
     if: github.event.pull_request.draft == false
@@ -141,68 +141,27 @@ jobs:
           sudo docker push localhost:5000/git-act-router
           minikube image load localhost:5000/git-act-router

-      - name: Deploy two-pods setup via helm charts
+      - name: Run all k8s discovery routing tests
         run: |
-          echo "🚀 Deploying two-pods setup with helm"
-          cd ${{ github.workspace }}
-          helm install vllm ./helm -f .github/values-06-session-routing.yaml
-
-      - name: Wait for pods to be ready
-        run: |
-          echo "⏳ Making wait-for-pods script executable and running it"
-          chmod +x .github/wait-for-pods.sh
-          ./.github/wait-for-pods.sh --pod-prefix vllm --timeout 300 --verbose
-
-      - name: Make test script executable
-        run: |
-          chmod +x tests/e2e/test-sticky-routing.sh
-
-      - name: Run sticky routing e2e test
-        run: |
-          echo "🧪 Running sticky routing test"
-          cd ${{ github.workspace }}
-          # Set the model to match what's deployed in the helm values
-          # Enable debug mode to preserve temp files for artifact collection
-          ./tests/e2e/test-sticky-routing.sh --model "facebook/opt-125m" --num-rounds 3 --verbose --debug
+          echo "🧪 Running all k8s discovery routing tests"
+          ./tests/e2e/run-k8s-routing-test.sh all \
+            --model "facebook/opt-125m" \
+            --num-requests 25 \
+            --chunk-size 128 \
+            --verbose \
+            --result-dir /tmp/k8s-discovery-routing-results-pr-${{ github.event.pull_request.number || 'main' }} \
+            --timeout 10
         timeout-minutes: 10

-      - name: Archive sticky routing test results
+      - name: Archive k8s discovery routing test results
         uses: actions/upload-artifact@v4
         if: always()
         with:
-          name: sticky-routing-test-results-pr-${{ github.event.pull_request.number || 'main' }}
+          name: k8s-discovery-routing-test-results-pr-${{ github.event.pull_request.number || 'main' }}
           path: |
-            /tmp/sticky-routing-results-*
-
-      - name: Get router and pod logs for debugging
-        if: always()
-        run: |
-          echo "📋 Collecting logs for debugging"
-          mkdir -p debug-logs
-          # Get router logs
-          kubectl logs -l app.kubernetes.io/component=router --tail=100 > debug-logs/router.log || true
-          # Get serving engine logs
-          kubectl logs -l app.kubernetes.io/component=serving-engine --tail=100 > debug-logs/serving-engines.log || true
-          # Get pod status
-          kubectl get pods -o wide > debug-logs/pod-status.txt || true
-          # Get services
-          kubectl get svc > debug-logs/services.txt || true
-
-      - name: Upload debug logs
-        uses: actions/upload-artifact@v4
-        if: always()
-        with:
-          name: debug-logs-pr-${{ github.event.pull_request.number || 'main' }}
-          path: debug-logs/
-
-      - name: Helm uninstall and cleanup
-        run: |
-          echo "🧹 Cleaning up resources"
-          helm uninstall vllm || true
-          sudo docker image prune -f || true
-        if: always()
+            /tmp/k8s-discovery-routing-results-pr-${{ github.event.pull_request.number || 'main' }}/*

-      - run: echo "🍏 Sticky routing e2e test job status is ${{ job.status }}."
+      - run: echo "🍏 K8s discovery e2e test job status is ${{ job.status }}."

   static-discovery-e2e-test:
     runs-on: self-hosted
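
The consolidated step above delegates everything to tests/e2e/run-k8s-routing-test.sh; the same entry point can presumably be run outside CI against a cluster that already has the chart and the local registry image in place, mirroring the CI invocation (the result directory below is an arbitrary local path, not the CI one):

chmod +x tests/e2e/run-k8s-routing-test.sh
./tests/e2e/run-k8s-routing-test.sh all \
  --model "facebook/opt-125m" \
  --num-requests 25 \
  --chunk-size 128 \
  --verbose \
  --result-dir /tmp/k8s-discovery-routing-results-local \
  --timeout 10
ls /tmp/k8s-discovery-routing-results-local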

docs/source/tutorials/disagg.rst

Lines changed: 2 additions & 2 deletions
@@ -134,7 +134,7 @@ Install the deployment using Helm with the configuration file:

 .. code-block:: bash

-   helm install pd helm/ -f tutorials/assets/values-16-disagg-prefill.yaml
+   helm install vllm helm/ -f tutorials/assets/values-16-disagg-prefill.yaml

 This will deploy:

@@ -172,7 +172,7 @@ First do port forwarding to access the service:

 .. code-block:: bash

-   kubectl port-forward svc/pd-router-service 30080:80
+   kubectl port-forward svc/vllm-router-service 30080:80

 And then send a request to the router by:

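The rename matters because the chart derives service names from the Helm release name: a release installed as pd exposes pd-router-service, while one installed as vllm exposes vllm-router-service. A quick way to confirm after installing (commands assembled from the snippets above, assuming kubectl access to the target namespace):

helm install vllm helm/ -f tutorials/assets/values-16-disagg-prefill.yaml
kubectl get svc | grep router   # expect vllm-router-service
kubectl port-forward svc/vllm-router-service 30080:80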

helm/templates/deployment-vllm-multi.yaml

Lines changed: 9 additions & 10 deletions
@@ -90,9 +90,7 @@ spec:
       - name: "vllm"
         image: "{{ required "Required value 'modelSpec.repository' must be defined !" $modelSpec.repository }}:{{ required "Required value 'modelSpec.tag' must be defined !" $modelSpec.tag }}"
         securityContext:
-          capabilities:
-            add:
-              - SYS_PTRACE
+          {{- toYaml .Values.servingEngineSpec.containerSecurityContext | nindent 12 }}
         command:
         {{- if or
           (eq $modelSpec.tag "2025-05-27-v1")
@@ -188,11 +186,7 @@ spec:
         {{- end }}
         {{- if $modelSpec.chatTemplate }}
         - "--chat-template"
-        - {{ $modelSpec.chatTemplate | quote }}
-        {{- end }}
-        {{- if .Values.servingEngineSpec.containerSecurityContext }}
-        securityContext:
-          {{- toYaml .Values.servingEngineSpec.containerSecurityContext | nindent 12 }}
+        - "/templates/{{ $modelSpec.chatTemplate }}"
         {{- end }}
         imagePullPolicy: Always
         env:
@@ -367,9 +361,14 @@ spec:
         {{- end}}
         {{- end}}
         {{- if $modelSpec.chatTemplate }}
+        {{- if hasKey $modelSpec "chatTemplateConfigMap" }}
+        - name: {{ .Release.Name }}-{{ $modelSpec.name }}-chat-templates
+          mountPath: /templates
+        {{- else }}
         - name: vllm-templates
           mountPath: /templates
         {{- end }}
+        {{- end }}
         {{- if hasKey $modelSpec "extraVolumeMounts" }}
         {{- toYaml $modelSpec.extraVolumeMounts | nindent 10 }}
         {{- end }}
@@ -395,7 +394,7 @@ spec:
       {{- end}}
       {{- if $modelSpec.chatTemplate}}
       {{- if hasKey $modelSpec "chatTemplateConfigMap" }}
-      - name: {{ .Release.Name }}-chat-templates
+      - name: {{ .Release.Name }}-{{ $modelSpec.name }}-chat-templates
        configMap:
          name: "{{ .Release.Name }}-{{$modelSpec.name}}-chat-templates"
      {{- else }}
@@ -440,7 +439,7 @@ metadata:
   namespace: "{{ .Release.Namespace }}"
 data:
   {{ $modelSpec.chatTemplate }}: |-
-    {{ $modelSpec.chatTemplateConfigMap }}
+    {{ $modelSpec.chatTemplateConfigMap | nindent 4 }}
 {{- end }}
 {{- end }}
 ---
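The trailing | nindent 4 is what keeps the multi-line chatTemplateConfigMap value correctly indented under its key when the ConfigMap is rendered; one way to eyeball the rendered manifest locally (a sketch, assuming the chart path and the values-07 file from this commit):

helm template vllm ./helm -f .github/values-07-prefix-routing.yaml \
  | grep -A 3 'chat.jinja2: |-'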

src/gateway_inference_extension/README.md

Lines changed: 2 additions & 2 deletions
@@ -30,8 +30,8 @@ kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extens
 # Install KGateway with inference extension enabled
 helm upgrade -i --namespace kgateway-system --version $KGTW_VERSION kgateway oci://cr.kgateway.dev/kgateway-dev/charts/kgateway --set inferenceExtension.enabled=true

-# Apply VLLM deployment
-kubectl apply -f configs/vllm/gpu-deployment.yaml
+# Apply VLLM deployment using the VLLMRuntime CRD
+kubectl apply -f configs/vllm/vllm-runtime.yaml

 # Apply inference model and pool resources
 kubectl apply -f configs/inferencemodel.yaml
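
After applying the manifest, the new resource can be checked alongside the inference extension objects (a hypothetical verification; the plural resource names are assumptions, and the VLLMRuntime CRD must already be installed in the cluster):

# List the VLLMRuntime object created from configs/vllm/vllm-runtime.yaml
kubectl get vllmruntimes
# List the inference pool/model resources applied in the next step
kubectl get inferencepools,inferencemodels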
