
Commit 40b9902

Author: BrianPark314
Merge remote-tracking branch 'origin/main'
2 parents 2d8c586 + 49956c1

File tree: 15 files changed, +774 additions, −99 deletions

.github/values-06-session-routing.yaml

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ servingEngineSpec:

     replicaCount: 2

-    requestCPU: 6
+    requestCPU: 4
     requestMemory: "16Gi"
     requestGPU: 0.5
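The only change here is the lower CPU request; to spot-check that it took effect on the serving pods after redeploying, something like the following should work (a hypothetical check, reusing the app.kubernetes.io/component=serving-engine label that appears in this repo's CI steps):

kubectl get pods -l app.kubernetes.io/component=serving-engine \
  -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.containers[0].resources.requests.cpu}{"\n"}{end}'

Each replica should now report a CPU request of 4.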

.github/values-07-prefix-routing.yaml

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+servingEngineSpec:
+  strategy:
+    type: Recreate
+  runtimeClassName: ""
+  modelSpec:
+  - name: "opt125m"
+    repository: "vllm/vllm-openai"
+    tag: "latest"
+    modelURL: "facebook/opt-125m"
+
+    replicaCount: 2
+
+    requestCPU: 4
+    requestMemory: "16Gi"
+    requestGPU: 1
+
+    pvcStorage: "10Gi"
+    pvcAccessMode:
+      - ReadWriteMany
+
+    vllmConfig:
+      maxModelLen: 1024
+      extraArgs: ["--disable-log-requests", "--gpu-memory-utilization", "0.8"]
+    chatTemplate: "chat.jinja2"
+    chatTemplateConfigMap: |-
+      {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
+      {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
+
+routerSpec:
+  repository: "localhost:5000/git-act-router"
+  imagePullPolicy: "IfNotPresent"
+  enableRouter: true
+  routingLogic: "prefixaware"
+  extraArgs:
+    - "--log-level"
+    - "info"
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
servingEngineSpec:
2+
strategy:
3+
type: Recreate
4+
runtimeClassName: ""
5+
modelSpec:
6+
- name: "opt125m"
7+
repository: "vllm/vllm-openai"
8+
tag: "latest"
9+
modelURL: "facebook/opt-125m"
10+
11+
replicaCount: 2
12+
13+
requestCPU: 4
14+
requestMemory: "16Gi"
15+
requestGPU: 1
16+
17+
pvcStorage: "10Gi"
18+
pvcAccessMode:
19+
- ReadWriteMany
20+
21+
vllmConfig:
22+
maxModelLen: 1024
23+
extraArgs: ["--disable-log-requests", "--gpu-memory-utilization", "0.8"]
24+
chatTemplate: "chat.jinja2"
25+
chatTemplateConfigMap: |-
26+
{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
27+
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
28+
29+
routerSpec:
30+
repository: "localhost:5000/git-act-router"
31+
imagePullPolicy: "IfNotPresent"
32+
enableRouter: true
33+
routingLogic: "roundrobin"
34+
extraArgs:
35+
- "--log-level"
36+
- "info"
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
servingEngineSpec:
2+
strategy:
3+
type: Recreate
4+
runtimeClassName: ""
5+
modelSpec:
6+
- name: "opt125m"
7+
repository: "vllm/vllm-openai"
8+
tag: "latest"
9+
modelURL: "facebook/opt-125m"
10+
11+
replicaCount: 2
12+
13+
requestCPU: 4
14+
requestMemory: "16Gi"
15+
requestGPU: 1
16+
17+
pvcStorage: "10Gi"
18+
pvcAccessMode:
19+
- ReadWriteMany
20+
21+
vllmConfig:
22+
maxModelLen: 1024
23+
extraArgs: ["--disable-log-requests", "--gpu-memory-utilization", "0.8"]
24+
chatTemplate: "chat.jinja2"
25+
chatTemplateConfigMap: |-
26+
{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
27+
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
28+
29+
routerSpec:
30+
repository: "localhost:5000/git-act-router"
31+
imagePullPolicy: "IfNotPresent"
32+
enableRouter: true
33+
routingLogic: "kvaware"
34+
extraArgs:
35+
- "--log-level"
36+
- "info"

.github/workflows/router-e2e-test.yml

Lines changed: 14 additions & 55 deletions
@@ -101,7 +101,7 @@ jobs:
           ~/.kube/config
           src/tests/perftest/logs

-  sticky-routing-e2e-test:
+  k8s-discovery-e2e-test:
     runs-on: self-hosted
     needs: e2e-test
     if: github.event.pull_request.draft == false
@@ -141,68 +141,27 @@ jobs:
           sudo docker push localhost:5000/git-act-router
           minikube image load localhost:5000/git-act-router

-      - name: Deploy two-pods setup via helm charts
+      - name: Run all k8s discovery routing tests
         run: |
-          echo "🚀 Deploying two-pods setup with helm"
-          cd ${{ github.workspace }}
-          helm install vllm ./helm -f .github/values-06-session-routing.yaml
-
-      - name: Wait for pods to be ready
-        run: |
-          echo "⏳ Making wait-for-pods script executable and running it"
-          chmod +x .github/wait-for-pods.sh
-          ./.github/wait-for-pods.sh --pod-prefix vllm --timeout 300 --verbose
-
-      - name: Make test script executable
-        run: |
-          chmod +x tests/e2e/test-sticky-routing.sh
-
-      - name: Run sticky routing e2e test
-        run: |
-          echo "🧪 Running sticky routing test"
-          cd ${{ github.workspace }}
-          # Set the model to match what's deployed in the helm values
-          # Enable debug mode to preserve temp files for artifact collection
-          ./tests/e2e/test-sticky-routing.sh --model "facebook/opt-125m" --num-rounds 3 --verbose --debug
+          echo "🧪 Running all k8s discovery routing tests"
+          ./tests/e2e/run-k8s-routing-test.sh all \
+            --model "facebook/opt-125m" \
+            --num-requests 25 \
+            --chunk-size 128 \
+            --verbose \
+            --result-dir /tmp/k8s-discovery-routing-results-pr-${{ github.event.pull_request.number || 'main' }} \
+            --timeout 10
         timeout-minutes: 10

-      - name: Archive sticky routing test results
+      - name: Archive k8s discovery routing test results
         uses: actions/upload-artifact@v4
         if: always()
         with:
-          name: sticky-routing-test-results-pr-${{ github.event.pull_request.number || 'main' }}
+          name: k8s-discovery-routing-test-results-pr-${{ github.event.pull_request.number || 'main' }}
           path: |
-            /tmp/sticky-routing-results-*
-
-      - name: Get router and pod logs for debugging
-        if: always()
-        run: |
-          echo "📋 Collecting logs for debugging"
-          mkdir -p debug-logs
-          # Get router logs
-          kubectl logs -l app.kubernetes.io/component=router --tail=100 > debug-logs/router.log || true
-          # Get serving engine logs
-          kubectl logs -l app.kubernetes.io/component=serving-engine --tail=100 > debug-logs/serving-engines.log || true
-          # Get pod status
-          kubectl get pods -o wide > debug-logs/pod-status.txt || true
-          # Get services
-          kubectl get svc > debug-logs/services.txt || true
-
-      - name: Upload debug logs
-        uses: actions/upload-artifact@v4
-        if: always()
-        with:
-          name: debug-logs-pr-${{ github.event.pull_request.number || 'main' }}
-          path: debug-logs/
-
-      - name: Helm uninstall and cleanup
-        run: |
-          echo "🧹 Cleaning up resources"
-          helm uninstall vllm || true
-          sudo docker image prune -f || true
-        if: always()
+            /tmp/k8s-discovery-routing-results-pr-${{ github.event.pull_request.number || 'main' }}/*

-      - run: echo "🍏 Sticky routing e2e test job status is ${{ job.status }}."
+      - run: echo "🍏 K8s discovery e2e test job status is ${{ job.status }}."

   static-discovery-e2e-test:
     runs-on: self-hosted
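
The consolidated step above delegates everything to tests/e2e/run-k8s-routing-test.sh; the same entry point can presumably be run outside CI against a cluster that already has the chart and the local registry image in place, mirroring the CI invocation (the result directory below is an arbitrary local path, not the CI one):

chmod +x tests/e2e/run-k8s-routing-test.sh
./tests/e2e/run-k8s-routing-test.sh all \
  --model "facebook/opt-125m" \
  --num-requests 25 \
  --chunk-size 128 \
  --verbose \
  --result-dir /tmp/k8s-discovery-routing-results-local \
  --timeout 10
ls /tmp/k8s-discovery-routing-results-local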

docs/source/tutorials/disagg.rst

Lines changed: 2 additions & 2 deletions
@@ -134,7 +134,7 @@ Install the deployment using Helm with the configuration file:

 .. code-block:: bash

-   helm install pd helm/ -f tutorials/assets/values-16-disagg-prefill.yaml
+   helm install vllm helm/ -f tutorials/assets/values-16-disagg-prefill.yaml

 This will deploy:

@@ -172,7 +172,7 @@ First do port forwarding to access the service:

 .. code-block:: bash

-   kubectl port-forward svc/pd-router-service 30080:80
+   kubectl port-forward svc/vllm-router-service 30080:80

 And then send a request to the router by:

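The rename matters because the chart derives service names from the Helm release name: a release installed as pd exposes pd-router-service, while one installed as vllm exposes vllm-router-service. A quick way to confirm after installing (commands assembled from the snippets above, assuming kubectl access to the target namespace):

helm install vllm helm/ -f tutorials/assets/values-16-disagg-prefill.yaml
kubectl get svc | grep router   # expect vllm-router-service
kubectl port-forward svc/vllm-router-service 30080:80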

helm/templates/deployment-vllm-multi.yaml

Lines changed: 9 additions & 10 deletions
@@ -90,9 +90,7 @@ spec:
       - name: "vllm"
         image: "{{ required "Required value 'modelSpec.repository' must be defined !" $modelSpec.repository }}:{{ required "Required value 'modelSpec.tag' must be defined !" $modelSpec.tag }}"
         securityContext:
-          capabilities:
-            add:
-              - SYS_PTRACE
+          {{- toYaml .Values.servingEngineSpec.containerSecurityContext | nindent 12 }}
         command:
         {{- if or
           (eq $modelSpec.tag "2025-05-27-v1")
@@ -188,11 +186,7 @@ spec:
         {{- end }}
         {{- if $modelSpec.chatTemplate }}
         - "--chat-template"
-        - {{ $modelSpec.chatTemplate | quote }}
-        {{- end }}
-        {{- if .Values.servingEngineSpec.containerSecurityContext }}
-        securityContext:
-          {{- toYaml .Values.servingEngineSpec.containerSecurityContext | nindent 12 }}
+        - "/templates/{{ $modelSpec.chatTemplate }}"
         {{- end }}
         imagePullPolicy: Always
         env:
@@ -367,9 +361,14 @@ spec:
         {{- end}}
         {{- end}}
         {{- if $modelSpec.chatTemplate }}
+        {{- if hasKey $modelSpec "chatTemplateConfigMap" }}
+        - name: {{ .Release.Name }}-{{ $modelSpec.name }}-chat-templates
+          mountPath: /templates
+        {{- else }}
         - name: vllm-templates
           mountPath: /templates
         {{- end }}
+        {{- end }}
         {{- if hasKey $modelSpec "extraVolumeMounts" }}
         {{- toYaml $modelSpec.extraVolumeMounts | nindent 10 }}
         {{- end }}
@@ -395,7 +394,7 @@ spec:
       {{- end}}
       {{- if $modelSpec.chatTemplate}}
       {{- if hasKey $modelSpec "chatTemplateConfigMap" }}
-      - name: {{ .Release.Name }}-chat-templates
+      - name: {{ .Release.Name }}-{{ $modelSpec.name }}-chat-templates
        configMap:
          name: "{{ .Release.Name }}-{{$modelSpec.name}}-chat-templates"
      {{- else }}
@@ -440,7 +439,7 @@ metadata:
   namespace: "{{ .Release.Namespace }}"
 data:
   {{ $modelSpec.chatTemplate }}: |-
-    {{ $modelSpec.chatTemplateConfigMap }}
+    {{ $modelSpec.chatTemplateConfigMap | nindent 4 }}
 {{- end }}
 {{- end }}
 ---
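The trailing | nindent 4 is what keeps the multi-line chatTemplateConfigMap value correctly indented under its key when the ConfigMap is rendered; one way to eyeball the rendered manifest locally (a sketch, assuming the chart path and the values-07 file from this commit):

helm template vllm ./helm -f .github/values-07-prefix-routing.yaml \
  | grep -A 3 'chat.jinja2: |-'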

src/gateway_inference_extension/README.md

Lines changed: 2 additions & 2 deletions
@@ -30,8 +30,8 @@ kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extens
 # Install KGateway with inference extension enabled
 helm upgrade -i --namespace kgateway-system --version $KGTW_VERSION kgateway oci://cr.kgateway.dev/kgateway-dev/charts/kgateway --set inferenceExtension.enabled=true

-# Apply VLLM deployment
-kubectl apply -f configs/vllm/gpu-deployment.yaml
+# Apply VLLM deployment using the VLLMRuntime CRD
+kubectl apply -f configs/vllm/vllm-runtime.yaml

 # Apply inference model and pool resources
 kubectl apply -f configs/inferencemodel.yaml
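
After applying the manifest, the new resource can be checked alongside the inference extension objects (a hypothetical verification; the plural resource names are assumptions, and the VLLMRuntime CRD must already be installed in the cluster):

# List the VLLMRuntime object created from configs/vllm/vllm-runtime.yaml
kubectl get vllmruntimes
# List the inference pool/model resources applied in the next step
kubectl get inferencepools,inferencemodels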
