1 change: 1 addition & 0 deletions src/gateway_inference_extension/Dockerfile
@@ -25,6 +25,7 @@ RUN git clone https://github.com/kubernetes-sigs/gateway-api-inference-extension
git apply scheduler.patch && \
cd ../../../.. && \
cp /src/roundrobin_picker.go gateway-api-inference-extension/pkg/epp/scheduling/plugins/picker/roundrobin_picker.go && \
cp /src/prefix_aware_picker.go gateway-api-inference-extension/pkg/epp/scheduling/plugins/picker/prefix_aware_picker.go && \
mkdir -p /src/pkg/ && \
cp -r gateway-api-inference-extension/pkg/epp/ /src/pkg/epp && \
cp gateway-api-inference-extension/go.mod /src && \
100 changes: 100 additions & 0 deletions src/gateway_inference_extension/configs/vllm/vllm-runtime.yaml
@@ -0,0 +1,100 @@
apiVersion: production-stack.vllm.ai/v1alpha1
kind: VLLMRuntime
metadata:
  name: vllm-llama3-1b-instruct   # must match Gateway route back-reference
  labels:
    app.kubernetes.io/component: model-server
    app.kubernetes.io/part-of: inference-gateway-demo
spec:
  # --- Core vLLM flags --------------------------------------------------
  v1: true                         # keep vLLM v1 API surface on
  tensorParallelSize: 1
  gpuMemoryUtilization: "0.9"
  maxLoras: 2
  extraArgs:
    - "--max-num-seq"              # identical to old Deployment
    - "1024"
    - "--compilation-config"
    - "3"
    - "--max-lora-rank"
    - "32"
    - "--max-cpu-loras"
    - "12"

  # --- Model ------------------------------------------------------------
  model:
    modelURL: "meta-llama/Llama-3.2-1B-Instruct"
    enableLoRA: true
    dtype: "bfloat16"
    maxModelLen: 4096
    maxNumSeqs: 1024               # duplicated for clarity

  # --- LoRA & cache off-loading ----------------------------------------
  lmCacheConfig:
    enabled: true
    remoteUrl: "lm://cacheserver-sample.default.svc.cluster.local:80"
    remoteSerde: "naive"
    cpuOffloadingBufferSize: "15"
    diskOffloadingBufferSize: "0"

  # --- Runtime image ----------------------------------------------------
  image:
    registry: "docker.io"
    name: "lmcache/vllm-openai:2025-05-05-v1"
    pullPolicy: "IfNotPresent"

  # --- Resources --------------------------------------------------------
  resources:
    cpu: "8"
    memory: "24Gi"
    gpu: "1"

  # --- Secret & env -----------------------------------------------------
  hfTokenSecret:
    name: "hf-token"
  env:
    - name: VLLM_USE_V1
      value: "1"
    - name: PORT
      value: "8000"
    - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
      value: "true"

  # --- Replication & strategy ------------------------------------------
  replicas: 2
  deploymentStrategy: "RollingUpdate"

  # --- Pod-level customisation (for adapter syncer & volumes) ----------
  podTemplate:
    spec:
      enableServiceLinks: false          # avoid VLLM_PORT collision
      terminationGracePeriodSeconds: 130
      volumes:
        - name: data
          emptyDir: {}
        - name: shm
          emptyDir:
            medium: Memory
        - name: adapters
          emptyDir: {}
        - name: config-volume
          configMap:
            name: vllm-llama3-1b-instruct-adapters
      initContainers:
        - name: lora-adapter-syncer
          image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/lora-syncer:main
          env:
            - name: DYNAMIC_LORA_ROLLOUT_CONFIG
              value: "/config/configmap.yaml"
          volumeMounts:
            - name: config-volume
              mountPath: /config
      containers:
        - name: vllm
          volumeMounts:
            - name: data
              mountPath: /data
            - name: shm
              mountPath: /dev/shm
            - name: adapters
              mountPath: /adapters
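
For context, one way to exercise the prefix-aware routing end to end is to send two completions that share a long prompt prefix and check which backend pod served each one (for example, via the pod logs). The sketch below is illustrative only: the gateway address and the assumption that the Gateway routes OpenAI-style `/v1/completions` traffic to this pool are placeholders, while the model name comes from the VLLMRuntime above.

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
)

// gatewayURL is a placeholder; substitute the address of the Gateway listener
// that fronts the inference pool in your cluster.
const gatewayURL = "http://localhost:8080/v1/completions"

func complete(prompt string) (string, error) {
	body, _ := json.Marshal(map[string]any{
		"model":      "meta-llama/Llama-3.2-1B-Instruct", // from the VLLMRuntime above
		"prompt":     prompt,
		"max_tokens": 16,
	})
	resp, err := http.Post(gatewayURL, "application/json", bytes.NewReader(body))
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	out, err := io.ReadAll(resp.Body)
	return string(out), err
}

func main() {
	// Two prompts sharing a long prefix: with the prefix-aware picker the
	// second request should land on the pod that served the first one, so
	// cached KV/LMCache entries for the shared prefix can be reused.
	shared := "You are a helpful assistant. Summarize the following document:"
	for _, p := range []string{shared + " (part 1)", shared + " (part 2)"} {
		if out, err := complete(p); err != nil {
			fmt.Println("request failed:", err)
		} else {
			fmt.Println(out)
		}
	}
}
```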
152 changes: 152 additions & 0 deletions src/gateway_inference_extension/prefix_aware_picker.go
@@ -0,0 +1,152 @@
/*
Copyright 2025 The vLLM Production Stack Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0
*/

package picker

import (
	"math/rand"
	"sync"
	"time"

	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
)

var _ plugins.Picker = &PrefixMatchPicker{}

// PrefixMatchPicker selects the engine whose URL was returned by the
// longest-prefix match against previously-seen prompts (same idea as the
// Python `route_request`). Ties are broken at random.
type PrefixMatchPicker struct {
	trie *hashTrie
	rnd  *rand.Rand
}

// NewPrefixMatchPicker returns a ready-to-use picker instance.
func NewPrefixMatchPicker() *PrefixMatchPicker {
	return &PrefixMatchPicker{
		trie: newHashTrie(),
		rnd:  rand.New(rand.NewSource(time.Now().UnixNano())),
	}
}

func (p *PrefixMatchPicker) Name() string { return "prefixmatch" }

// Pick implements plugins.Picker.
//
// SchedulingContext is assumed to carry the inference request body in
// ctx.RequestBody (map[string]any) with the prompt at key "prompt". Adjust
// the accessor if your integration differs.
func (p *PrefixMatchPicker) Pick(
	ctx *types.SchedulingContext,
	scoredPods []*types.ScoredPod,
) *types.Result {
	if len(scoredPods) == 0 {
		return &types.Result{}
	}

	prompt, _ := ctx.RequestBody["prompt"].(string)

	// 1. Build the set of available endpoints.
	available := make(map[string]struct{}, len(scoredPods))
	for _, sp := range scoredPods {
		ep := sp.GetPod().EndpointURL // <-- adapt this accessor
		available[ep] = struct{}{}
	}

	// 2. Longest-prefix match within the trie.
	matched := p.trie.longestPrefixMatch(prompt, available)

	// 3. Fallback: no match --> all endpoints are candidates.
	if len(matched) == 0 {
		for ep := range available {
			matched[ep] = struct{}{}
		}
	}

	// 4. Convert the matched set to a slice and pick randomly.
	endpoints := make([]string, 0, len(matched))
	for ep := range matched {
		endpoints = append(endpoints, ep)
	}
	selected := endpoints[p.rnd.Intn(len(endpoints))]

	// 5. Cache the decision for future prefix look-ups.
	p.trie.insert(prompt, selected)

	// 6. Return the pod whose URL matches `selected`.
	for _, sp := range scoredPods {
		if sp.GetPod().EndpointURL == selected { // same accessor as above
			return &types.Result{TargetPod: sp}
		}
	}
	// Should never hit; safe fallback.
	return &types.Result{TargetPod: scoredPods[0]}
}

/*---------------------------- trie implementation ---------------------------*/

type hashTrie struct {
	mu        sync.RWMutex
	children  map[rune]*hashTrie
	endpoints map[string]struct{}
}

func newHashTrie() *hashTrie {
	return &hashTrie{children: make(map[rune]*hashTrie)}
}

func (t *hashTrie) insert(key, endpoint string) {
	t.mu.Lock()
	defer t.mu.Unlock()

	node := t
	for _, r := range key {
		child, ok := node.children[r]
		if !ok {
			child = newHashTrie()
			node.children[r] = child
		}
		node = child
	}
	if node.endpoints == nil {
		node.endpoints = make(map[string]struct{})
	}
	node.endpoints[endpoint] = struct{}{}
}

func (t *hashTrie) longestPrefixMatch(
	key string,
	available map[string]struct{},
) map[string]struct{} {
	t.mu.RLock()
	defer t.mu.RUnlock()

	var lastMatch map[string]struct{}
	node := t
	for _, r := range key {
		if node.endpoints != nil {
			lastMatch = node.endpoints
		}
		child, ok := node.children[r]
		if !ok {
			break
		}
		node = child
	}
	// Also check the node reached after consuming the whole key, so an
	// exact (full-prefix) match is not missed.
	if node.endpoints != nil {
		lastMatch = node.endpoints
	}

	// Filter by `available`.
	res := make(map[string]struct{})
	for ep := range lastMatch {
		if _, ok := available[ep]; ok {
			res[ep] = struct{}{}
		}
	}
	return res
}
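
Since the routing decision ultimately rests on hashTrie.longestPrefixMatch, a small test makes its behaviour concrete: the longest previously cached prompt that is a prefix of the incoming prompt determines the candidate endpoints, and anything else falls through to the random fallback in Pick. The sketch below is illustrative only (not part of the PR) and assumes it sits next to prefix_aware_picker.go in the same picker package; the prompts and endpoint strings are made up.

```go
package picker

import "testing"

func TestHashTrieLongestPrefixMatch(t *testing.T) {
	trie := newHashTrie()
	trie.insert("You are a helpful assistant.", "pod-a:8000")
	trie.insert("You are a helpful assistant. Summarize", "pod-b:8000")

	available := map[string]struct{}{
		"pod-a:8000": {},
		"pod-b:8000": {},
	}

	// The longest previously inserted prompt that prefixes the new prompt wins.
	got := trie.longestPrefixMatch("You are a helpful assistant. Summarize this report", available)
	if _, ok := got["pod-b:8000"]; !ok || len(got) != 1 {
		t.Fatalf("expected only pod-b:8000, got %v", got)
	}

	// An exact repeat of a cached prompt matches that prompt's endpoint.
	got = trie.longestPrefixMatch("You are a helpful assistant.", available)
	if _, ok := got["pod-a:8000"]; !ok {
		t.Fatalf("expected pod-a:8000, got %v", got)
	}

	// An unrelated prompt matches nothing; Pick then falls back to all endpoints.
	if got := trie.longestPrefixMatch("Tell me a joke", available); len(got) != 0 {
		t.Fatalf("expected no match, got %v", got)
	}
}
```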
1 change: 1 addition & 0 deletions src/gateway_inference_extension/scheduler.patch
@@ -16,6 +16,7 @@ index b484cde..c7688a8 100644
scorers: map[plugins.Scorer]int{},
- picker: &picker.RandomPicker{},
+ picker: &picker.RoundRobinPicker{},
+ picker: &picker.PrefixAwarePicker{},
postSchedulePlugins: []plugins.PostSchedule{},
}
