1 change: 1 addition & 0 deletions src/gateway_inference_extension/Dockerfile
@@ -25,6 +25,7 @@ RUN git clone https://github.com/kubernetes-sigs/gateway-api-inference-extension
git apply scheduler.patch && \
cd ../../../.. && \
cp /src/roundrobin_picker.go gateway-api-inference-extension/pkg/epp/scheduling/plugins/picker/roundrobin_picker.go && \
cp /src/prefix_aware_picker.go gateway-api-inference-extension/pkg/epp/scheduling/plugins/picker/prefix_aware_picker.go && \
mkdir -p /src/pkg/ && \
cp -r gateway-api-inference-extension/pkg/epp/ /src/pkg/epp && \
cp gateway-api-inference-extension/go.mod /src && \
100 changes: 100 additions & 0 deletions src/gateway_inference_extension/configs/vllm/vllm-runtime.yaml
@@ -0,0 +1,100 @@
apiVersion: production-stack.vllm.ai/v1alpha1
kind: VLLMRuntime
metadata:
  name: vllm-llama3-1b-instruct   # must match Gateway route back-reference
  labels:
    app.kubernetes.io/component: model-server
    app.kubernetes.io/part-of: inference-gateway-demo
spec:
  # --- Core vLLM flags --------------------------------------------------
  v1: true                         # keep vLLM v1 API surface on
  tensorParallelSize: 1
  gpuMemoryUtilization: "0.9"
  maxLoras: 2
  extraArgs:
    - "--max-num-seq"              # identical to old Deployment
    - "1024"
    - "--compilation-config"
    - "3"
    - "--max-lora-rank"
    - "32"
    - "--max-cpu-loras"
    - "12"

  # --- Model ------------------------------------------------------------
  model:
    modelURL: "meta-llama/Llama-3.2-1B-Instruct"
    enableLoRA: true
    dtype: "bfloat16"
    maxModelLen: 4096
    maxNumSeqs: 1024               # duplicated for clarity

  # --- LoRA & cache off-loading ----------------------------------------
  lmCacheConfig:
    enabled: true
    remoteUrl: "lm://cacheserver-sample.default.svc.cluster.local:80"
    remoteSerde: "naive"
    cpuOffloadingBufferSize: "15"
    diskOffloadingBufferSize: "0"

  # --- Runtime image ----------------------------------------------------
  image:
    registry: "docker.io"
    name: "lmcache/vllm-openai:2025-05-05-v1"
    pullPolicy: "IfNotPresent"

  # --- Resources --------------------------------------------------------
  resources:
    cpu: "8"
    memory: "24Gi"
    gpu: "1"

  # --- Secret & env -----------------------------------------------------
  hfTokenSecret:
    name: "hf-token"
  env:
    - name: VLLM_USE_V1
      value: "1"
    - name: PORT
      value: "8000"
    - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
      value: "true"

  # --- Replication & strategy ------------------------------------------
  replicas: 2
  deploymentStrategy: "RollingUpdate"

  # --- Pod-level customisation (for adapter syncer & volumes) ----------
  podTemplate:
    spec:
      enableServiceLinks: false          # avoid VLLM_PORT collision
      terminationGracePeriodSeconds: 130
      volumes:
        - name: data
          emptyDir: {}
        - name: shm
          emptyDir:
            medium: Memory
        - name: adapters
          emptyDir: {}
        - name: config-volume
          configMap:
            name: vllm-llama3-1b-instruct-adapters
      initContainers:
        - name: lora-adapter-syncer
          image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/lora-syncer:main
          env:
            - name: DYNAMIC_LORA_ROLLOUT_CONFIG
              value: "/config/configmap.yaml"
          volumeMounts:
            - name: config-volume
              mountPath: /config
      containers:
        - name: vllm
          volumeMounts:
            - name: data
              mountPath: /data
            - name: shm
              mountPath: /dev/shm
            - name: adapters
              mountPath: /adapters
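
For context, one way to exercise the prefix-aware routing end to end is to send two completions that share a long prompt prefix and check which backend pod served each one (for example, via the pod logs). The sketch below is illustrative only: the gateway address and the assumption that the Gateway routes OpenAI-style `/v1/completions` traffic to this pool are placeholders, while the model name comes from the VLLMRuntime above.

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
)

// gatewayURL is a placeholder; substitute the address of the Gateway listener
// that fronts the inference pool in your cluster.
const gatewayURL = "http://localhost:8080/v1/completions"

func complete(prompt string) (string, error) {
	body, _ := json.Marshal(map[string]any{
		"model":      "meta-llama/Llama-3.2-1B-Instruct", // from the VLLMRuntime above
		"prompt":     prompt,
		"max_tokens": 16,
	})
	resp, err := http.Post(gatewayURL, "application/json", bytes.NewReader(body))
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	out, err := io.ReadAll(resp.Body)
	return string(out), err
}

func main() {
	// Two prompts sharing a long prefix: with the prefix-aware picker the
	// second request should land on the pod that served the first one, so
	// cached KV/LMCache entries for the shared prefix can be reused.
	shared := "You are a helpful assistant. Summarize the following document:"
	for _, p := range []string{shared + " (part 1)", shared + " (part 2)"} {
		if out, err := complete(p); err != nil {
			fmt.Println("request failed:", err)
		} else {
			fmt.Println(out)
		}
	}
}
```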
152 changes: 152 additions & 0 deletions src/gateway_inference_extension/prefix_aware_picker.go
@@ -0,0 +1,152 @@
/*
Copyright 2025 The vLLM Production Stack Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0
*/

package picker

import (
	"math/rand"
	"sync"
	"time"

	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
)

var _ plugins.Picker = &PrefixMatchPicker{}

// PrefixMatchPicker selects the engine whose URL was returned by the
// longest-prefix match against previously-seen prompts (same idea as the
// Python `route_request`). Ties are broken at random.
type PrefixMatchPicker struct {
	trie *hashTrie
	rnd  *rand.Rand
}

// NewPrefixMatchPicker returns a ready-to-use picker instance.
func NewPrefixMatchPicker() *PrefixMatchPicker {
	return &PrefixMatchPicker{
		trie: newHashTrie(),
		rnd:  rand.New(rand.NewSource(time.Now().UnixNano())),
	}
}

func (p *PrefixMatchPicker) Name() string { return "prefixmatch" }

// Pick implements plugins.Picker.
//
// SchedulingContext is assumed to carry the inference request body in
// ctx.RequestBody (map[string]any) with the prompt at key "prompt". Adjust
// the accessor if your integration differs.
func (p *PrefixMatchPicker) Pick(
	ctx *types.SchedulingContext,
	scoredPods []*types.ScoredPod,
) *types.Result {
	if len(scoredPods) == 0 {
		return &types.Result{}
	}

	prompt, _ := ctx.RequestBody["prompt"].(string)

	// 1. Build the set of available endpoints.
	available := make(map[string]struct{}, len(scoredPods))
	for _, sp := range scoredPods {
		ep := sp.GetPod().EndpointURL // <-- adapt this accessor
		available[ep] = struct{}{}
	}

	// 2. Longest-prefix match within the trie.
	matched := p.trie.longestPrefixMatch(prompt, available)

	// 3. Fallback: no match --> all endpoints are candidates.
	if len(matched) == 0 {
		for ep := range available {
			matched[ep] = struct{}{}
		}
	}

	// 4. Convert the matched set to a slice and pick randomly.
	endpoints := make([]string, 0, len(matched))
	for ep := range matched {
		endpoints = append(endpoints, ep)
	}
	selected := endpoints[p.rnd.Intn(len(endpoints))]

	// 5. Cache the decision for future prefix look-ups.
	p.trie.insert(prompt, selected)

	// 6. Return the pod whose URL matches `selected`.
	for _, sp := range scoredPods {
		if sp.GetPod().EndpointURL == selected { // same accessor as above
			return &types.Result{TargetPod: sp}
		}
	}
	// Should never hit; safe fallback.
	return &types.Result{TargetPod: scoredPods[0]}
}

/*---------------------------- trie implementation ---------------------------*/

type hashTrie struct {
	mu        sync.RWMutex
	children  map[rune]*hashTrie
	endpoints map[string]struct{}
}

func newHashTrie() *hashTrie {
	return &hashTrie{children: make(map[rune]*hashTrie)}
}

func (t *hashTrie) insert(key, endpoint string) {
	t.mu.Lock()
	defer t.mu.Unlock()

	node := t
	for _, r := range key {
		child, ok := node.children[r]
		if !ok {
			child = newHashTrie()
			node.children[r] = child
		}
		node = child
	}
	if node.endpoints == nil {
		node.endpoints = make(map[string]struct{})
	}
	node.endpoints[endpoint] = struct{}{}
}

func (t *hashTrie) longestPrefixMatch(
	key string,
	available map[string]struct{},
) map[string]struct{} {
	t.mu.RLock()
	defer t.mu.RUnlock()

	var lastMatch map[string]struct{}
	node := t
	for _, r := range key {
		if node.endpoints != nil {
			lastMatch = node.endpoints
		}
		child, ok := node.children[r]
		if !ok {
			break
		}
		node = child
	}
	// Also check the node reached after consuming the whole key, so an
	// exact (full-prefix) match is not missed.
	if node.endpoints != nil {
		lastMatch = node.endpoints
	}

	// Filter by `available`.
	res := make(map[string]struct{})
	for ep := range lastMatch {
		if _, ok := available[ep]; ok {
			res[ep] = struct{}{}
		}
	}
	return res
}
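
Since the routing decision ultimately rests on hashTrie.longestPrefixMatch, a small test makes its behaviour concrete: the longest previously cached prompt that is a prefix of the incoming prompt determines the candidate endpoints, and anything else falls through to the random fallback in Pick. The sketch below is illustrative only (not part of the PR) and assumes it sits next to prefix_aware_picker.go in the same picker package; the prompts and endpoint strings are made up.

```go
package picker

import "testing"

func TestHashTrieLongestPrefixMatch(t *testing.T) {
	trie := newHashTrie()
	trie.insert("You are a helpful assistant.", "pod-a:8000")
	trie.insert("You are a helpful assistant. Summarize", "pod-b:8000")

	available := map[string]struct{}{
		"pod-a:8000": {},
		"pod-b:8000": {},
	}

	// The longest previously inserted prompt that prefixes the new prompt wins.
	got := trie.longestPrefixMatch("You are a helpful assistant. Summarize this report", available)
	if _, ok := got["pod-b:8000"]; !ok || len(got) != 1 {
		t.Fatalf("expected only pod-b:8000, got %v", got)
	}

	// An exact repeat of a cached prompt matches that prompt's endpoint.
	got = trie.longestPrefixMatch("You are a helpful assistant.", available)
	if _, ok := got["pod-a:8000"]; !ok {
		t.Fatalf("expected pod-a:8000, got %v", got)
	}

	// An unrelated prompt matches nothing; Pick then falls back to all endpoints.
	if got := trie.longestPrefixMatch("Tell me a joke", available); len(got) != 0 {
		t.Fatalf("expected no match, got %v", got)
	}
}
```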
1 change: 1 addition & 0 deletions src/gateway_inference_extension/scheduler.patch
@@ -16,6 +16,7 @@ index b484cde..c7688a8 100644
scorers: map[plugins.Scorer]int{},
- picker: &picker.RandomPicker{},
+ picker: &picker.RoundRobinPicker{},
+ picker: &picker.PrefixAwarePicker{},
postSchedulePlugins: []plugins.PostSchedule{},
}
