Skip to content

Commit db38fc4

Browse files
author
BrianPark314
committed
Merge branch 'main' into feature/prefix-aware-routing
# Conflicts:
#	src/gateway_inference_extension/configs/vllm/vllm-runtime.yaml
2 parents bca73e5 + 40b9902 commit db38fc4

File tree

66 files changed

+4835
-1373
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

66 files changed

+4835
-1373
lines changed

.github/template-chatml.jinja

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
2+
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}

.github/values-06-session-routing.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ servingEngineSpec:
1010

1111
replicaCount: 2
1212

13-
requestCPU: 6
13+
requestCPU: 4
1414
requestMemory: "16Gi"
1515
requestGPU: 0.5
1616

.github/values-07-prefix-routing.yaml

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
servingEngineSpec:
2+
strategy:
3+
type: Recreate
4+
runtimeClassName: ""
5+
modelSpec:
6+
- name: "opt125m"
7+
repository: "vllm/vllm-openai"
8+
tag: "latest"
9+
modelURL: "facebook/opt-125m"
10+
11+
replicaCount: 2
12+
13+
requestCPU: 4
14+
requestMemory: "16Gi"
15+
requestGPU: 1
16+
17+
pvcStorage: "10Gi"
18+
pvcAccessMode:
19+
- ReadWriteMany
20+
21+
vllmConfig:
22+
maxModelLen: 1024
23+
extraArgs: ["--disable-log-requests", "--gpu-memory-utilization", "0.8"]
24+
chatTemplate: "chat.jinja2"
25+
chatTemplateConfigMap: |-
26+
{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
27+
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
28+
29+
routerSpec:
30+
repository: "localhost:5000/git-act-router"
31+
imagePullPolicy: "IfNotPresent"
32+
enableRouter: true
33+
routingLogic: "prefixaware"
34+
extraArgs:
35+
- "--log-level"
36+
- "info"
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
servingEngineSpec:
2+
strategy:
3+
type: Recreate
4+
runtimeClassName: ""
5+
modelSpec:
6+
- name: "opt125m"
7+
repository: "vllm/vllm-openai"
8+
tag: "latest"
9+
modelURL: "facebook/opt-125m"
10+
11+
replicaCount: 2
12+
13+
requestCPU: 4
14+
requestMemory: "16Gi"
15+
requestGPU: 1
16+
17+
pvcStorage: "10Gi"
18+
pvcAccessMode:
19+
- ReadWriteMany
20+
21+
vllmConfig:
22+
maxModelLen: 1024
23+
extraArgs: ["--disable-log-requests", "--gpu-memory-utilization", "0.8"]
24+
chatTemplate: "chat.jinja2"
25+
chatTemplateConfigMap: |-
26+
{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
27+
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
28+
29+
routerSpec:
30+
repository: "localhost:5000/git-act-router"
31+
imagePullPolicy: "IfNotPresent"
32+
enableRouter: true
33+
routingLogic: "roundrobin"
34+
extraArgs:
35+
- "--log-level"
36+
- "info"
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
servingEngineSpec:
2+
strategy:
3+
type: Recreate
4+
runtimeClassName: ""
5+
modelSpec:
6+
- name: "opt125m"
7+
repository: "vllm/vllm-openai"
8+
tag: "latest"
9+
modelURL: "facebook/opt-125m"
10+
11+
replicaCount: 2
12+
13+
requestCPU: 4
14+
requestMemory: "16Gi"
15+
requestGPU: 1
16+
17+
pvcStorage: "10Gi"
18+
pvcAccessMode:
19+
- ReadWriteMany
20+
21+
vllmConfig:
22+
maxModelLen: 1024
23+
extraArgs: ["--disable-log-requests", "--gpu-memory-utilization", "0.8"]
24+
chatTemplate: "chat.jinja2"
25+
chatTemplateConfigMap: |-
26+
{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
27+
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
28+
29+
routerSpec:
30+
repository: "localhost:5000/git-act-router"
31+
imagePullPolicy: "IfNotPresent"
32+
enableRouter: true
33+
routingLogic: "kvaware"
34+
extraArgs:
35+
- "--log-level"
36+
- "info"

.github/workflows/pre-commit.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ jobs:
1515
python-version: "3.12"
1616
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
1717
- uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
18+
- run: pip install ruff
19+
- run: ruff check src/tests
1820

1921
pre-commit-manual:
2022
strategy:

.github/workflows/router-e2e-test.yml

Lines changed: 74 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ jobs:
101101
~/.kube/config
102102
src/tests/perftest/logs
103103
104-
sticky-routing-e2e-test:
104+
k8s-discovery-e2e-test:
105105
runs-on: self-hosted
106106
needs: e2e-test
107107
if: github.event.pull_request.draft == false
@@ -141,65 +141,92 @@ jobs:
141141
sudo docker push localhost:5000/git-act-router
142142
minikube image load localhost:5000/git-act-router
143143
144-
- name: Deploy two-pods setup via helm charts
144+
- name: Run all k8s discovery routing tests
145145
run: |
146-
echo "🚀 Deploying two-pods setup with helm"
147-
cd ${{ github.workspace }}
148-
helm install vllm ./helm -f .github/values-06-session-routing.yaml
146+
echo "🧪 Running all k8s discovery routing tests"
147+
./tests/e2e/run-k8s-routing-test.sh all \
148+
--model "facebook/opt-125m" \
149+
--num-requests 25 \
150+
--chunk-size 128 \
151+
--verbose \
152+
--result-dir /tmp/k8s-discovery-routing-results-pr-${{ github.event.pull_request.number || 'main' }} \
153+
--timeout 10
154+
timeout-minutes: 10
155+
156+
- name: Archive k8s discovery routing test results
157+
uses: actions/upload-artifact@v4
158+
if: always()
159+
with:
160+
name: k8s-discovery-routing-test-results-pr-${{ github.event.pull_request.number || 'main' }}
161+
path: |
162+
/tmp/k8s-discovery-routing-results-pr-${{ github.event.pull_request.number || 'main' }}/*
163+
164+
- run: echo "🍏 K8s discovery e2e test job status is ${{ job.status }}."
165+
166+
static-discovery-e2e-test:
167+
runs-on: self-hosted
168+
needs: e2e-test
169+
if: github.event.pull_request.draft == false
170+
env:
171+
LOG_DIR: /tmp/static-discovery-e2e-test-${{ github.event.pull_request.number || 'main' }}
149172

150-
- name: Wait for pods to be ready
173+
steps:
174+
- name: Check out repository code
175+
uses: actions/checkout@v4
176+
177+
- name: Setup Python
178+
uses: actions/setup-python@v5
179+
with:
180+
python-version: "3.12"
181+
182+
- name: Install Python dependencies
151183
run: |
152-
echo "⏳ Making wait-for-pods script executable and running it"
153-
chmod +x .github/wait-for-pods.sh
154-
./.github/wait-for-pods.sh --pod-prefix vllm --timeout 300 --verbose
184+
python -m pip install --upgrade pip
185+
pip install -e .
155186
156-
- name: Make test script executable
187+
- name: Install vLLM
157188
run: |
158-
chmod +x tests/e2e/test-sticky-routing.sh
189+
pip install vllm
159190
160-
- name: Run sticky routing e2e test
191+
- name: Start 2 vLLM serve backends
161192
run: |
162-
echo "🧪 Running sticky routing test"
163-
cd ${{ github.workspace }}
164-
# Set the model to match what's deployed in the helm values
165-
# Enable debug mode to preserve temp files for artifact collection
166-
./tests/e2e/test-sticky-routing.sh --model "facebook/opt-125m" --num-rounds 3 --verbose --debug
167-
timeout-minutes: 10
193+
echo "🚀 Starting vLLM serve backend"
194+
mkdir -p "$LOG_DIR"
195+
CUDA_VISIBLE_DEVICES=0 vllm serve facebook/opt-125m --port 8001 --gpu-memory-utilization 0.7 --chat-template .github/template-chatml.jinja > "$LOG_DIR/backend1.log" 2>&1 &
196+
CUDA_VISIBLE_DEVICES=1 vllm serve facebook/opt-125m --port 8002 --gpu-memory-utilization 0.7 --chat-template .github/template-chatml.jinja > "$LOG_DIR/backend2.log" 2>&1 &
168197
169-
- name: Archive sticky routing test results
198+
- name: Wait for backends to be ready
199+
run: |
200+
echo "⏳ Waiting for backends to be ready"
201+
chmod +x tests/e2e/wait-for-backends.sh
202+
./tests/e2e/wait-for-backends.sh 180 "http://localhost:8001" "http://localhost:8002"
203+
204+
- name: Run All Static Discovery Routing Tests
205+
env:
206+
PYTHONPATH: ${{ github.workspace }}/src
207+
run: |
208+
echo "🧪 Running all static discovery routing tests sequentially"
209+
chmod +x tests/e2e/run-static-discovery-routing-test.sh
210+
./tests/e2e/run-static-discovery-routing-test.sh all \
211+
--pythonpath "$PYTHONPATH" \
212+
--log-dir "$LOG_DIR" \
213+
--num-requests 20 \
214+
--verbose
215+
timeout-minutes: 5
216+
217+
- name: Archive static discovery test results and logs
170218
uses: actions/upload-artifact@v4
171219
if: always()
172220
with:
173-
name: sticky-routing-test-results-pr-${{ github.event.pull_request.number || 'main' }}
221+
name: static-discovery-test-results-pr-${{ github.event.pull_request.number || 'main' }}
174222
path: |
175-
/tmp/sticky-routing-results-*
223+
${{ env.LOG_DIR }}/*
176224
177-
- name: Get router and pod logs for debugging
178-
if: always()
179-
run: |
180-
echo "📋 Collecting logs for debugging"
181-
mkdir -p debug-logs
182-
# Get router logs
183-
kubectl logs -l app.kubernetes.io/component=router --tail=100 > debug-logs/router.log || true
184-
# Get serving engine logs
185-
kubectl logs -l app.kubernetes.io/component=serving-engine --tail=100 > debug-logs/serving-engines.log || true
186-
# Get pod status
187-
kubectl get pods -o wide > debug-logs/pod-status.txt || true
188-
# Get services
189-
kubectl get svc > debug-logs/services.txt || true
190-
191-
- name: Upload debug logs
192-
uses: actions/upload-artifact@v4
225+
- name: Cleanup processes
193226
if: always()
194-
with:
195-
name: debug-logs-pr-${{ github.event.pull_request.number || 'main' }}
196-
path: debug-logs/
197-
198-
- name: Helm uninstall and cleanup
199227
run: |
200-
echo "🧹 Cleaning up resources"
201-
helm uninstall vllm || true
202-
sudo docker image prune -f || true
203-
if: always()
228+
echo "🧹 Cleaning up processes"
229+
pkill -f "vllm serve" || true
230+
pkill -f "python3 -m src.vllm_router.app" || true
204231
205-
- run: echo "🍏 Sticky routing e2e test job status is ${{ job.status }}."
232+
- run: echo "🍏 Static discovery e2e test job status is ${{ job.status }}."

0 commit comments

Comments
 (0)