@@ -177,7 +177,6 @@ spec:
177177      terminationGracePeriodSeconds : 130 
178178      nodeSelector :
179179        cloud.google.com/gke-accelerator : " nvidia-h100-80gb" 
180- 
181180      volumes :
182181        - name : data 
183182          emptyDir : {} 
@@ -250,40 +249,133 @@ spec:
250249    spec :
251250      terminationGracePeriodSeconds : 130 
252251      containers :
253-         - name : epp 
254-           image : us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main 
255-           imagePullPolicy : Always 
256-           args :
257-             - -poolName 
258-             - " vllm-llama3-8b-instruct-new" 
259-             - " -poolNamespace" 
260-             - " default" 
261-             - -v 
262-             - " 4" 
263-             - --zap-encoder 
264-             - " json" 
265-             - -grpcPort 
266-             - " 9002" 
267-             - -grpcHealthPort 
268-             - " 9003" 
269-           ports :
270-             - containerPort : 9002 
271-             - containerPort : 9003 
272-             - name : metrics 
273-               containerPort : 9090 
274-           livenessProbe :
275-             grpc :
276-               port : 9003 
277-               service : inference-extension 
278-             initialDelaySeconds : 5 
279-             periodSeconds : 10 
280-           readinessProbe :
281-             grpc :
282-               port : 9003 
283-               service : inference-extension 
284-             initialDelaySeconds : 5 
285-             periodSeconds : 10 
286-   EOF 
252+       - name : epp 
253+         image : us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main 
254+         imagePullPolicy : Always 
255+         args :
256+         - -poolName 
257+         - " vllm-llama3-8b-instruct-new" 
258+         - -poolNamespace 
259+         - " default" 
260+         - -v 
261+         - " 4" 
262+         - --zap-encoder 
263+         - " json" 
264+         - -grpcPort 
265+         - " 9002" 
266+         - -grpcHealthPort 
267+         - " 9003" 
268+         - -configFile 
269+         - " /config/default-plugins.yaml" 
270+         ports :
271+         - containerPort : 9002 
272+           name : grpc 
273+         - containerPort : 9003 
274+           name : grpc-health 
275+         - containerPort : 9090 
276+           name : metrics 
277+         livenessProbe :
278+           grpc :
279+             port : 9003 
280+             service : inference-extension 
281+           initialDelaySeconds : 5 
282+           periodSeconds : 10 
283+         readinessProbe :
284+           grpc :
285+             port : 9003 
286+             service : inference-extension 
287+           initialDelaySeconds : 5 
288+           periodSeconds : 10 
289+         volumeMounts :
290+         - name : plugins-config-volume 
291+           mountPath : /config 
292+       volumes :
293+       - name : plugins-config-volume 
294+         configMap :
295+           name : plugins-config 
296+ ---
# ConfigMap carrying the Endpoint Picker (EPP) plugin configuration files.
# The EPP Deployment above mounts this ConfigMap at /config, so each key under
# `data` is exposed to the container as a file named after the key.
apiVersion: v1
kind: ConfigMap
metadata:
  name: plugins-config
  namespace: default
data:
  # Selected by the EPP container via `-configFile /config/default-plugins.yaml`.
  # The single `default` scheduling profile runs the `low-latency-filter`
  # decision tree and then `random-picker`.
  default-plugins.yaml: |
    apiVersion: inference.networking.x-k8s.io/v1alpha1
    kind: EndpointPickerConfig
    plugins:
    - type: low-queue-filter
      parameters:
        threshold: 128
    - type: lora-affinity-filter
      parameters:
        threshold: 0.999
    - type: least-queue-filter
    - type: least-kv-cache-filter
    - type: decision-tree-filter
      name: low-latency-filter
      parameters:
        current:
          pluginRef: low-queue-filter
        nextOnSuccess:
          decisionTree:
            current:
              pluginRef: lora-affinity-filter
            nextOnSuccessOrFailure:
              decisionTree:
                current:
                  pluginRef: least-queue-filter
                nextOnSuccessOrFailure:
                  decisionTree:
                    current:
                      pluginRef: least-kv-cache-filter
        nextOnFailure:
          decisionTree:
            current:
              pluginRef: least-queue-filter
            nextOnSuccessOrFailure:
              decisionTree:
                current:
                  pluginRef: lora-affinity-filter
                nextOnSuccessOrFailure:
                  decisionTree:
                    current:
                      pluginRef: least-kv-cache-filter
    - type: random-picker
      parameters:
        maxNumOfEndpoints: 1
    - type: single-profile-handler
    schedulingProfiles:
    - name: default
      plugins:
      - pluginRef: low-latency-filter
      - pluginRef: random-picker
  # Second configuration file: a scorer-based profile (queue, kv-cache and
  # prefix-cache scorers, each with weight 1) followed by `max-score-picker`.
  # It is not referenced by the `-configFile` argument of the Deployment above.
  # This key must stay indented under `data`; at column 0 it would become an
  # unknown top-level field of the ConfigMap instead of a data entry.
  plugins-v2.yaml: |
    apiVersion: inference.networking.x-k8s.io/v1alpha1
    kind: EndpointPickerConfig
    plugins:
    - type: queue-scorer
    - type: kv-cache-scorer
    - type: prefix-cache-scorer
      parameters:
        hashBlockSize: 64
        maxPrefixBlocksToMatch: 256
        lruCapacityPerServer: 31250
    - type: max-score-picker
      parameters:
        maxNumOfEndpoints: 1
    - type: single-profile-handler
    schedulingProfiles:
    - name: default
      plugins:
      - pluginRef: queue-scorer
        weight: 1
      - pluginRef: kv-cache-scorer
        weight: 1
      - pluginRef: prefix-cache-scorer
        weight: 1
      - pluginRef: max-score-picker
378+ EOF 
287379``` 
288380
### Direct traffic to the new inference pool
0 commit comments