From c98ab6a2e642863671f297f040d02170b499e12e Mon Sep 17 00:00:00 2001
From: Le Xu <le.xu@bytedance.com>
Date: Tue, 15 Jul 2025 17:58:36 -0700
Subject: [PATCH 01/19] adding raw metrics name conversion in metrics

Signed-off-by: Le Xu <le.xu@bytedance.com>
---
 pkg/cache/cache_metrics.go | 32 ++++++++++++++++---
 pkg/metrics/metrics.go     | 64 ++++++++++++++++++++++++++++++++++++--
 pkg/metrics/types.go       | 13 ++++----
 3 files changed, 96 insertions(+), 13 deletions(-)

diff --git a/pkg/cache/cache_metrics.go b/pkg/cache/cache_metrics.go
index c1caa1c78..be5b36174 100644
--- a/pkg/cache/cache_metrics.go
+++ b/pkg/cache/cache_metrics.go
@@ -33,6 +33,7 @@ const (
 	// the request port and metrics port may differ, so a dedicated metrics port is required.
 	MetricPortLabel                     = "model.aibrix.ai/metric-port"
 	defaultPodMetricRefreshIntervalInMS = 50
+	engineLabel                         = "model.aibrix.ai/engine"
 )
 
 var (
@@ -173,7 +174,7 @@ func (c *Store) updateSimpleMetricFromRawMetrics(pod *Pod, allMetrics map[string
 		}
 
 		// TODO: we should refact metricName to fit other engine
-		metricFamily, exists := allMetrics[fmt.Sprintf("vllm:%s", metricName)]
+		metricFamily, exists := c.fetchMetrics(pod, allMetrics, metricName)
 		if !exists {
 			klog.V(4).Infof("Cannot find %v in the pod metrics", metricName)
 			continue
@@ -208,8 +209,7 @@ func (c *Store) updateHistogramMetricFromRawMetrics(pod *Pod, allMetrics map[str
 			klog.V(4).Infof("Cannot find %v in the metric list", metricName)
 			continue
 		}
-
-		metricFamily, exists := allMetrics[fmt.Sprintf("vllm:%s", metricName)]
+		metricFamily, exists := c.fetchMetrics(pod, allMetrics, metricName)
 		if !exists {
 			klog.V(4).Infof("Cannot find %v in the pod metrics", metricName)
 			continue
@@ -250,7 +250,7 @@ func (c *Store) updateQueryLabelMetricFromRawMetrics(pod *Pod, allMetrics map[st
 		}
 		rawMetricName := metric.RawMetricName
 		scope := metric.MetricScope
-		metricFamily, exists := allMetrics[fmt.Sprintf("vllm:%s", rawMetricName)]
+		metricFamily, exists := c.fetchMetrics(pod, allMetrics, rawMetricName)
 		if !exists {
 			klog.V(4).Infof("Cannot find %v in the pod metrics", rawMetricName)
 			continue
@@ -330,6 +330,30 @@ func (c *Store) queryUpdatePromQLMetrics(ctx context.Context, metric metrics.Met
 	return nil
 }
 
+func (c *Store) fetchMetrics(pod *Pod, allMetrics map[string]*dto.MetricFamily, labelMetricName string) (*dto.MetricFamily, bool) {
+	metric, exists := metrics.Metrics[labelMetricName]
+	if !exists {
+		klog.V(4).Infof("Cannot find %v in the metric list", labelMetricName)
+		return nil, false
+	}
+	engineType, ok := pod.Labels[engineLabel]
+	if !ok {
+		klog.V(4).InfoS("No engine label, default to vllm", "name", pod.Name)
+		engineType = "vllm"
+	}
+	rawMetricName, ok := metric.RawMetricNameMapping[engineType]
+	if !ok {
+		klog.V(4).Infof("Cannot find %v in the metric list, engine type %v", labelMetricName, engineType)
+		return nil, false
+	}
+	metricFamily, exists := allMetrics[rawMetricName]
+	if !exists {
+		klog.V(4).Infof("Cannot find raw metrics name %v, engine type %v", rawMetricName, engineType)
+		return nil, false
+	}
+	return metricFamily, true
+}
+
 // Update `PodMetrics` and `PodModelMetrics` according to the metric scope
 // TODO: replace in-place metric update podMetrics and podModelMetrics to fresh copy for preventing stale metric keys
 func (c *Store) updatePodRecord(pod *Pod, modelName string, metricName string, scope metrics.MetricScope, metricValue metrics.MetricValue) error {
diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go
index 8a7d82a65..d4acd4ac5 100644
--- a/pkg/metrics/metrics.go
+++ b/pkg/metrics/metrics.go
@@ -63,6 +63,10 @@ var (
 			MetricType: MetricType{
 				Raw: Counter,
 			},
+			RawMetricNameMapping: map[string]string{
+				"vllm":   "vllm:num_requests_running",
+				"sglang": "sglang:num_running_reqs",
+			},
 			Description: "Number of running requests",
 		},
 		NumRequestsWaiting: {
@@ -71,6 +75,9 @@ var (
 			MetricType: MetricType{
 				Raw: Counter,
 			},
+			RawMetricNameMapping: map[string]string{
+				"vllm": "vllm:num_requests_waiting",
+			},
 			Description: "Number of waiting requests",
 		},
 		NumRequestsSwapped: {
@@ -79,6 +86,9 @@ var (
 			MetricType: MetricType{
 				Raw: Counter,
 			},
+			RawMetricNameMapping: map[string]string{
+				"vllm": "vllm:num_requests_swapped",
+			},
 			Description: "Number of swapped requests",
 		},
 		// Gauge metrics
@@ -88,6 +98,9 @@ var (
 			MetricType: MetricType{
 				Raw: Gauge,
 			},
+			RawMetricNameMapping: map[string]string{
+				"vllm": "vllm:avg_prompt_throughput_toks_per_s",
+			},
 			Description: "Average prompt throughput in tokens per second",
 		},
 		AvgGenerationThroughputToksPerS: {
@@ -96,6 +109,9 @@ var (
 			MetricType: MetricType{
 				Raw: Gauge,
 			},
+			RawMetricNameMapping: map[string]string{
+				"vllm": "vllm:avg_generation_throughput_toks_per_s",
+			},
 			Description: "Average generation throughput in tokens per second",
 		},
 		// Histogram metrics
@@ -105,6 +121,9 @@ var (
 			MetricType: MetricType{
 				Raw: Histogram,
 			},
+			RawMetricNameMapping: map[string]string{
+				"vllm": "vllm:iteration_tokens_total",
+			},
 			Description: "Total iteration tokens",
 		},
 		TimeToFirstTokenSeconds: {
@@ -113,6 +132,10 @@ var (
 			MetricType: MetricType{
 				Raw: Histogram,
 			},
+			RawMetricNameMapping: map[string]string{
+				"vllm":   "vllm:time_to_first_token_seconds",
+				"sglang": "vllm:sglang:time_to_first_token_seconds",
+			},
 			Description: "Time to first token in seconds",
 		},
 		TimePerOutputTokenSeconds: {
@@ -121,6 +144,10 @@ var (
 			MetricType: MetricType{
 				Raw: Histogram,
 			},
+			RawMetricNameMapping: map[string]string{
+				"vllm":   "vllm:time_per_output_token_seconds",
+				"sglang": "sglang:inter_token_latency_seconds",
+			},
 			Description: "Time per output token in seconds",
 		},
 		E2ERequestLatencySeconds: {
@@ -129,6 +156,10 @@ var (
 			MetricType: MetricType{
 				Raw: Histogram,
 			},
+			RawMetricNameMapping: map[string]string{
+				"vllm":   "vllm:e2e_request_latency_seconds",
+				"sglang": "sglang:e2e_request_latency_seconds",
+			},
 			Description: "End-to-end request latency in seconds",
 		},
 		RequestQueueTimeSeconds: {
@@ -137,6 +168,9 @@ var (
 			MetricType: MetricType{
 				Raw: Histogram,
 			},
+			RawMetricNameMapping: map[string]string{
+				"vllm": "vllm:request_queue_time_seconds",
+			},
 			Description: "Request queue time in seconds",
 		},
 		RequestInferenceTimeSeconds: {
@@ -145,6 +179,9 @@ var (
 			MetricType: MetricType{
 				Raw: Histogram,
 			},
+			RawMetricNameMapping: map[string]string{
+				"vllm": "vllm:request_inference_time_seconds",
+			},
 			Description: "Request inference time in seconds",
 		},
 		RequestDecodeTimeSeconds: {
@@ -153,6 +190,9 @@ var (
 			MetricType: MetricType{
 				Raw: Histogram,
 			},
+			RawMetricNameMapping: map[string]string{
+				"vllm": "vllm:request_decode_time_seconds",
+			},
 			Description: "Request decode time in seconds",
 		},
 		RequestPrefillTimeSeconds: {
@@ -161,6 +201,9 @@ var (
 			MetricType: MetricType{
 				Raw: Histogram,
 			},
+			RawMetricNameMapping: map[string]string{
+				"vllm": "vllm:request_prefill_time_seconds",
+			},
 			Description: "Request prefill time in seconds",
 		},
 		// Query-based metrics
@@ -233,6 +276,9 @@ var (
 			MetricType: MetricType{
 				Raw: Counter,
 			},
+			RawMetricNameMapping: map[string]string{
+				"vllm": "vllm:gpu_cache_usage_perc",
+			},
 			Description: "GPU cache usage percentage",
 		},
 		CPUCacheUsagePerc: {
@@ -241,6 +287,9 @@ var (
 			MetricType: MetricType{
 				Raw: Counter,
 			},
+			RawMetricNameMapping: map[string]string{
+				"vllm": "vllm:cpu_cache_usage_perc",
+			},
 			Description: "CPU cache usage percentage",
 		},
 		AvgE2ELatencyPod: {
@@ -286,7 +335,10 @@ var (
 				Query: QueryLabel,
 			},
 			RawMetricName: "lora_requests_info",
-			Description:   "Max count of Lora Adapters",
+			RawMetricNameMapping: map[string]string{
+				"vllm": "vllm:max_lora",
+			},
+			Description: "Max count of Lora Adapters",
 		},
 		RunningLoraAdapters: {
 			MetricScope:  PodMetricScope,
@@ -295,7 +347,10 @@ var (
 				Query: QueryLabel,
 			},
 			RawMetricName: "lora_requests_info",
-			Description:   "Count of running Lora Adapters",
+			RawMetricNameMapping: map[string]string{
+				"vllm": "vllm:running_lora_adapters",
+			},
+			Description: "Count of running Lora Adapters",
 		},
 		WaitingLoraAdapters: {
 			MetricScope:  PodMetricScope,
@@ -304,7 +359,10 @@ var (
 				Query: QueryLabel,
 			},
 			RawMetricName: "lora_requests_info",
-			Description:   "Count of waiting Lora Adapters",
+			RawMetricNameMapping: map[string]string{
+				"vllm": "vllm:waiting_lora_adapters",
+			},
+			Description: "Count of waiting Lora Adapters",
 		},
 		VTCBucketSizeActive: {
 			MetricScope:  PodModelMetricScope,
diff --git a/pkg/metrics/types.go b/pkg/metrics/types.go
index 1e41eb297..5ca40bf11 100644
--- a/pkg/metrics/types.go
+++ b/pkg/metrics/types.go
@@ -77,12 +77,13 @@ const (
 
 // Metric defines a unique metric with metadata.
 type Metric struct {
-	MetricSource  MetricSource
-	MetricType    MetricType
-	PromQL        string // Optional: Only applicable for PromQL-based metrics
-	RawMetricName string // Optional: Only applicable for QueryLabel-based metrics
-	Description   string
-	MetricScope   MetricScope
+	MetricSource         MetricSource
+	MetricType           MetricType
+	PromQL               string // Optional: Only applicable for PromQL-based metrics
+	RawMetricName        string // Optional: Only applicable for QueryLabel-based metrics
+	RawMetricNameMapping map[string]string
+	Description          string
+	MetricScope          MetricScope
 }
 
 // MetricValue is the interface for all metric values.

From 42a5d75beef6822012e4a71a542929fa5ee26168 Mon Sep 17 00:00:00 2001
From: Le Xu <le.xu@bytedance.com>
Date: Sun, 20 Jul 2025 20:21:58 -0700
Subject: [PATCH 02/19] adding policy least gpu/util

Signed-off-by: Le Xu <le.xu@bytedance.com>
---
 pkg/cache/cache_metrics.go                    |  1 +
 pkg/metrics/metrics.go                        | 15 ++-
 .../gateway/algorithms/least_gpu_cache.go     | 92 +++++++++++++++++++
 pkg/plugins/gateway/algorithms/least_util.go  | 81 ++++++++++++++++
 4 files changed, 188 insertions(+), 1 deletion(-)
 create mode 100644 pkg/plugins/gateway/algorithms/least_gpu_cache.go
 create mode 100644 pkg/plugins/gateway/algorithms/least_util.go

diff --git a/pkg/cache/cache_metrics.go b/pkg/cache/cache_metrics.go
index be5b36174..a15b22245 100644
--- a/pkg/cache/cache_metrics.go
+++ b/pkg/cache/cache_metrics.go
@@ -45,6 +45,7 @@ var (
 		metrics.AvgGenerationThroughputToksPerS,
 		metrics.GPUCacheUsagePerc,
 		metrics.CPUCacheUsagePerc,
+		metrics.GPUBusyTimeRatio,
 	}
 
 	// histogram metric example - time_to_first_token_seconds, _sum, _bucket _count.
diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go
index d4acd4ac5..8e91405eb 100644
--- a/pkg/metrics/metrics.go
+++ b/pkg/metrics/metrics.go
@@ -277,10 +277,23 @@ var (
 				Raw: Counter,
 			},
 			RawMetricNameMapping: map[string]string{
-				"vllm": "vllm:gpu_cache_usage_perc",
+				"vllm":   "vllm:gpu_cache_usage_perc",
+				"sglang": "sglang:token_usage",
+				"xllm":   "kv_cache_utilization",
 			},
 			Description: "GPU cache usage percentage",
 		},
+		GPUBusyTimeRatio: {
+			MetricScope:  PodModelMetricScope,
+			MetricSource: PodRawMetrics,
+			MetricType: MetricType{
+				Raw: Gauge,
+			},
+			RawMetricNameMapping: map[string]string{
+				"xllm": "engine_utilization",
+			},
+			Description: "GPU busy time ratio",
+		},
 		CPUCacheUsagePerc: {
 			MetricScope:  PodModelMetricScope,
 			MetricSource: PodRawMetrics,
diff --git a/pkg/plugins/gateway/algorithms/least_gpu_cache.go b/pkg/plugins/gateway/algorithms/least_gpu_cache.go
new file mode 100644
index 000000000..cf9686124
--- /dev/null
+++ b/pkg/plugins/gateway/algorithms/least_gpu_cache.go
@@ -0,0 +1,92 @@
+/*
+Copyright 2024 The Aibrix Team.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package routingalgorithms
+
+import (
+	"fmt"
+	"math"
+	"math/rand"
+
+	"github.com/vllm-project/aibrix/pkg/cache"
+	metrics "github.com/vllm-project/aibrix/pkg/metrics"
+	"github.com/vllm-project/aibrix/pkg/types"
+	v1 "k8s.io/api/core/v1"
+	klog "k8s.io/klog/v2"
+)
+
+const RouterLeastGpuCache types.RoutingAlgorithm = "least-gpu-cache"
+
+func init() {
+	Register(RouterLeastGpuCache, NewLeastGpuCacheRouter)
+}
+
+type leastGpuCacheRouter struct {
+	cache cache.Cache
+}
+
+func NewLeastGpuCacheRouter() (types.Router, error) {
+	c, err := cache.Get()
+	if err != nil {
+		return nil, err
+	}
+
+	return leastGpuCacheRouter{
+		cache: c,
+	}, nil
+}
+
+func (r leastGpuCacheRouter) Route(ctx *types.RoutingContext, readyPodList types.PodList) (string, error) {
+	var targetPod *v1.Pod
+	minGpuCache := math.MaxFloat64
+
+	for _, pod := range readyPodList.All() {
+		// Due to metric refactor (pull/543) to better support lora and multi models,
+		// we change to use PodModelMetrics instead of PodMetrics in some scenarios.
+		// This works but doesn't look very promising, we can revisit this part later.
+		gpuCache, err := r.cache.GetMetricValueByPodModel(pod.Name, pod.Namespace, ctx.Model, metrics.GPUCacheUsagePerc)
+		if err != nil {
+			klog.Error(err)
+			continue
+		}
+		totalCache := gpuCache.GetSimpleValue()
+
+		klog.V(4).Infof("pod: %v, podIP: %v, gpuCache: %v",
+			pod.Name, pod.Status.PodIP, gpuCache.GetSimpleValue())
+
+		if totalCache <= minGpuCache {
+			minGpuCache = totalCache
+			targetPod = pod
+		}
+	}
+
+	// Use fallback if no valid metrics
+	if targetPod == nil {
+		var err error
+		targetPod, err = SelectRandomPodAsFallback(ctx, readyPodList.All(), rand.Intn)
+		if err != nil {
+			return "", err
+		}
+	}
+
+	if targetPod == nil {
+		return "", fmt.Errorf("no pods to forward request")
+	}
+
+	klog.V(4).Infof("targetPod: %s(%s)", targetPod.Name, targetPod.Status.PodIP)
+	ctx.SetTargetPod(targetPod)
+	return ctx.TargetAddress(), nil
+}
diff --git a/pkg/plugins/gateway/algorithms/least_util.go b/pkg/plugins/gateway/algorithms/least_util.go
new file mode 100644
index 000000000..c0201404a
--- /dev/null
+++ b/pkg/plugins/gateway/algorithms/least_util.go
@@ -0,0 +1,81 @@
+/*
+Copyright 2024 The Aibrix Team.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package routingalgorithms
+
+import (
+	"math"
+	"math/rand"
+
+	"github.com/vllm-project/aibrix/pkg/cache"
+	"github.com/vllm-project/aibrix/pkg/metrics"
+	"github.com/vllm-project/aibrix/pkg/types"
+	v1 "k8s.io/api/core/v1"
+	klog "k8s.io/klog/v2"
+)
+
+const RouterUtil types.RoutingAlgorithm = "least-util"
+
+func init() {
+	Register(RouterUtil, NewLeastUtilRouter)
+}
+
+type leastUtilRouter struct {
+	cache cache.Cache
+}
+
+func NewLeastUtilRouter() (types.Router, error) {
+	c, err := cache.Get()
+	if err != nil {
+		return nil, err
+	}
+
+	return leastUtilRouter{
+		cache: c,
+	}, nil
+}
+
+func (r leastUtilRouter) Route(ctx *types.RoutingContext, readyPodList types.PodList) (string, error) {
+	var targetPod *v1.Pod
+	minBusyTimeRatio := math.MaxFloat64 // <= 1 in general
+
+	for _, pod := range readyPodList.All() {
+		busyTimeRatio, err := r.cache.GetMetricValueByPod(pod.Name, pod.Namespace, metrics.GPUBusyTimeRatio) // todo: replace mock
+		if err != nil {
+			klog.Error(err)
+			continue
+		}
+		busyTimeRatioValue := busyTimeRatio.GetSimpleValue()
+		klog.V(4).Infof("pod: %v, podIP: %v, GPU busy time ratio: %v", pod.Name, pod.Status.PodIP, busyTimeRatioValue)
+
+		if busyTimeRatioValue < minBusyTimeRatio {
+			minBusyTimeRatio = busyTimeRatioValue
+			targetPod = pod
+		}
+	}
+
+	// Use fallback if no valid metrics
+	if targetPod == nil {
+		var err error
+		targetPod, err = SelectRandomPodAsFallback(ctx, readyPodList.All(), rand.Intn)
+		if err != nil {
+			return "", err
+		}
+	}
+
+	ctx.SetTargetPod(targetPod)
+	return ctx.TargetAddress(), nil
+}

From 9ad913ea5927e1ab5b47f6f13c3482adb874d755 Mon Sep 17 00:00:00 2001
From: Le Xu <le.xu@bytedance.com>
Date: Mon, 21 Jul 2025 15:42:06 -0700
Subject: [PATCH 03/19] enhance log for debugging purposes

Signed-off-by: Le Xu <le.xu@bytedance.com>
---
 pkg/plugins/gateway/algorithms/least_busy_time.go | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pkg/plugins/gateway/algorithms/least_busy_time.go b/pkg/plugins/gateway/algorithms/least_busy_time.go
index 56ced9bce..ff8b085eb 100644
--- a/pkg/plugins/gateway/algorithms/least_busy_time.go
+++ b/pkg/plugins/gateway/algorithms/least_busy_time.go
@@ -59,11 +59,12 @@ func (r leastBusyTimeRouter) Route(ctx *types.RoutingContext, readyPodList types
 			continue
 		}
 		busyTimeRatioValue := busyTimeRatio.GetSimpleValue()
-		klog.V(4).Infof("pod: %v, podIP: %v, GPU busy time ratio: %v", pod.Name, pod.Status.PodIP, busyTimeRatioValue)
+		klog.Infof("pod: %v, podIP: %v, GPU busy time ratio: %v", pod.Name, pod.Status.PodIP, busyTimeRatioValue)
 
 		if busyTimeRatioValue < minBusyTimeRatio {
 			minBusyTimeRatio = busyTimeRatioValue
 			targetPod = pod
+			klog.Infof("select target pod: %v, podIP: %v, GPU busy time ratio: %v", pod.Name, pod.Status.PodIP, busyTimeRatioValue)
 		}
 	}
 
@@ -74,6 +75,7 @@ func (r leastBusyTimeRouter) Route(ctx *types.RoutingContext, readyPodList types
 		if err != nil {
 			return "", err
 		}
+		klog.Infof("select random pod: %v, podIP: %v", targetPod.Name, targetPod.Status.PodIP)
 	}
 
 	ctx.SetTargetPod(targetPod)

From b164150304c52d2c970eb2d8257b779deabadd3c Mon Sep 17 00:00:00 2001
From: happyandslow <le.xu@bytedance.com>
Date: Mon, 21 Jul 2025 22:51:30 +0000
Subject: [PATCH 04/19] enhance log for debugging purposes

Signed-off-by: root <root@bytedance>
Signed-off-by: Le Xu <le.xu@bytedance.com>
---
 pkg/plugins/gateway/algorithms/least_busy_time.go | 4 +++-
 pkg/plugins/gateway/algorithms/least_gpu_cache.go | 8 ++++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/pkg/plugins/gateway/algorithms/least_busy_time.go b/pkg/plugins/gateway/algorithms/least_busy_time.go
index ff8b085eb..ee806c482 100644
--- a/pkg/plugins/gateway/algorithms/least_busy_time.go
+++ b/pkg/plugins/gateway/algorithms/least_busy_time.go
@@ -64,7 +64,6 @@ func (r leastBusyTimeRouter) Route(ctx *types.RoutingContext, readyPodList types
 		if busyTimeRatioValue < minBusyTimeRatio {
 			minBusyTimeRatio = busyTimeRatioValue
 			targetPod = pod
-			klog.Infof("select target pod: %v, podIP: %v, GPU busy time ratio: %v", pod.Name, pod.Status.PodIP, busyTimeRatioValue)
 		}
 	}
 
@@ -77,6 +76,9 @@ func (r leastBusyTimeRouter) Route(ctx *types.RoutingContext, readyPodList types
 		}
 		klog.Infof("select random pod: %v, podIP: %v", targetPod.Name, targetPod.Status.PodIP)
 	}
+	else{
+		klog.Infof("select target pod: %v, podIP: %v, GPU busy time ratio: %v", pod.Name, pod.Status.PodIP, minBusyTimeRatio)
+	}
 
 	ctx.SetTargetPod(targetPod)
 	return ctx.TargetAddress(), nil
diff --git a/pkg/plugins/gateway/algorithms/least_gpu_cache.go b/pkg/plugins/gateway/algorithms/least_gpu_cache.go
index cf9686124..2a75278c7 100644
--- a/pkg/plugins/gateway/algorithms/least_gpu_cache.go
+++ b/pkg/plugins/gateway/algorithms/least_gpu_cache.go
@@ -64,7 +64,7 @@ func (r leastGpuCacheRouter) Route(ctx *types.RoutingContext, readyPodList types
 		}
 		totalCache := gpuCache.GetSimpleValue()
 
-		klog.V(4).Infof("pod: %v, podIP: %v, gpuCache: %v",
+		klog.Infof("pod: %v, podIP: %v, gpuCache: %v",
 			pod.Name, pod.Status.PodIP, gpuCache.GetSimpleValue())
 
 		if totalCache <= minGpuCache {
@@ -80,13 +80,17 @@ func (r leastGpuCacheRouter) Route(ctx *types.RoutingContext, readyPodList types
 		if err != nil {
 			return "", err
 		}
+		klog.Infof("select random targetPod: %s(%s)", targetPod.Name, targetPod.Status.PodIP)
+	}
+	else{
+		klog.Infof("select targetPod: %s(%s) gpuCache: %v", targetPod.Name, targetPod.Status.PodIP, gpuCache.GetSimpleValue())
 	}
 
 	if targetPod == nil {
 		return "", fmt.Errorf("no pods to forward request")
 	}
 
-	klog.V(4).Infof("targetPod: %s(%s)", targetPod.Name, targetPod.Status.PodIP)
+	klog.Infof("targetPod: %s(%s)", targetPod.Name, targetPod.Status.PodIP)
 	ctx.SetTargetPod(targetPod)
 	return ctx.TargetAddress(), nil
 }

From 4798279c117aa7c21931bed229532fe988538fc3 Mon Sep 17 00:00:00 2001
From: happyandslow <le.xu@bytedance.com>
Date: Mon, 21 Jul 2025 22:59:34 +0000
Subject: [PATCH 05/19] enhance log for debugging purposes v0.3.0-metrics-3

Signed-off-by: happyandslow <le.xu@bytedance.com>
Signed-off-by: Le Xu <le.xu@bytedance.com>
---
 pkg/plugins/gateway/algorithms/least_busy_time.go | 5 ++---
 pkg/plugins/gateway/algorithms/least_gpu_cache.go | 5 ++---
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/pkg/plugins/gateway/algorithms/least_busy_time.go b/pkg/plugins/gateway/algorithms/least_busy_time.go
index ee806c482..fea73cf25 100644
--- a/pkg/plugins/gateway/algorithms/least_busy_time.go
+++ b/pkg/plugins/gateway/algorithms/least_busy_time.go
@@ -75,9 +75,8 @@ func (r leastBusyTimeRouter) Route(ctx *types.RoutingContext, readyPodList types
 			return "", err
 		}
 		klog.Infof("select random pod: %v, podIP: %v", targetPod.Name, targetPod.Status.PodIP)
-	}
-	else{
-		klog.Infof("select target pod: %v, podIP: %v, GPU busy time ratio: %v", pod.Name, pod.Status.PodIP, minBusyTimeRatio)
+	} else {
+		klog.Infof("select target pod: %v, podIP: %v, GPU busy time ratio: %v", targetPod.Name, targetPod.Status.PodIP, minBusyTimeRatio)
 	}
 
 	ctx.SetTargetPod(targetPod)
diff --git a/pkg/plugins/gateway/algorithms/least_gpu_cache.go b/pkg/plugins/gateway/algorithms/least_gpu_cache.go
index 2a75278c7..f7ceee83d 100644
--- a/pkg/plugins/gateway/algorithms/least_gpu_cache.go
+++ b/pkg/plugins/gateway/algorithms/least_gpu_cache.go
@@ -81,9 +81,8 @@ func (r leastGpuCacheRouter) Route(ctx *types.RoutingContext, readyPodList types
 			return "", err
 		}
 		klog.Infof("select random targetPod: %s(%s)", targetPod.Name, targetPod.Status.PodIP)
-	}
-	else{
-		klog.Infof("select targetPod: %s(%s) gpuCache: %v", targetPod.Name, targetPod.Status.PodIP, gpuCache.GetSimpleValue())
+	} else {
+		klog.Infof("select targetPod: %s(%s) gpuCache: %v", targetPod.Name, targetPod.Status.PodIP, minGpuCache)
 	}
 
 	if targetPod == nil {

From 080d3e990e7f9662da8dc07617f941e81698bc37 Mon Sep 17 00:00:00 2001
From: Le Xu <le.xu@bytedance.com>
Date: Tue, 22 Jul 2025 10:16:15 -0700
Subject: [PATCH 06/19] enhance logs

Signed-off-by: Le Xu <le.xu@bytedance.com>
---
 pkg/cache/cache_metrics.go | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/pkg/cache/cache_metrics.go b/pkg/cache/cache_metrics.go
index a15b22245..4292e78d0 100644
--- a/pkg/cache/cache_metrics.go
+++ b/pkg/cache/cache_metrics.go
@@ -34,6 +34,7 @@ const (
 	MetricPortLabel                     = "model.aibrix.ai/metric-port"
 	defaultPodMetricRefreshIntervalInMS = 50
 	engineLabel                         = "model.aibrix.ai/engine"
+	defaultEngineLabel                  = "vllm"
 )
 
 var (
@@ -334,22 +335,22 @@ func (c *Store) queryUpdatePromQLMetrics(ctx context.Context, metric metrics.Met
 func (c *Store) fetchMetrics(pod *Pod, allMetrics map[string]*dto.MetricFamily, labelMetricName string) (*dto.MetricFamily, bool) {
 	metric, exists := metrics.Metrics[labelMetricName]
 	if !exists {
-		klog.V(4).Infof("Cannot find %v in the metric list", labelMetricName)
+		klog.V(4).Infof("Cannot find labelMetricName %v in collected metrics names", labelMetricName)
 		return nil, false
 	}
 	engineType, ok := pod.Labels[engineLabel]
 	if !ok {
-		klog.V(4).InfoS("No engine label, default to vllm", "name", pod.Name)
-		engineType = "vllm"
+		klog.V(4).Infof("No engine label pod %v, default to %v", pod.Name, defaultEngineLabel)
+		engineType = defaultEngineLabel
 	}
 	rawMetricName, ok := metric.RawMetricNameMapping[engineType]
 	if !ok {
-		klog.V(4).Infof("Cannot find %v in the metric list, engine type %v", labelMetricName, engineType)
+		klog.V(4).Infof("Cannot find engine type %v mapping for metrics %v", engineType, labelMetricName)
 		return nil, false
 	}
 	metricFamily, exists := allMetrics[rawMetricName]
 	if !exists {
-		klog.V(4).Infof("Cannot find raw metrics name %v, engine type %v", rawMetricName, engineType)
+		klog.V(4).Infof("Cannot find raw metrics %v, engine type %v", rawMetricName, engineType)
 		return nil, false
 	}
 	return metricFamily, true

From db273edc235ca26221f514067760e3e19c48388e Mon Sep 17 00:00:00 2001
From: Le Xu <lexuatwork@gmail.com>
Date: Tue, 22 Jul 2025 09:57:52 -0700
Subject: [PATCH 07/19] Update pkg/metrics/metrics.go

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Signed-off-by: Le Xu <lexuatwork@gmail.com>
---
 pkg/metrics/metrics.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go
index 8e91405eb..5c48a9822 100644
--- a/pkg/metrics/metrics.go
+++ b/pkg/metrics/metrics.go
@@ -134,7 +134,7 @@ var (
 			},
 			RawMetricNameMapping: map[string]string{
 				"vllm":   "vllm:time_to_first_token_seconds",
-				"sglang": "vllm:sglang:time_to_first_token_seconds",
+				"sglang": "sglang:time_to_first_token_seconds",
 			},
 			Description: "Time to first token in seconds",
 		},

From 6f05475d609650e36ab28b4e4d1427f43370ef2f Mon Sep 17 00:00:00 2001
From: Le Xu <lexuatwork@gmail.com>
Date: Tue, 22 Jul 2025 10:17:46 -0700
Subject: [PATCH 08/19] Update pkg/metrics/types.go

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Signed-off-by: Le Xu <lexuatwork@gmail.com>
---
 pkg/metrics/types.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/metrics/types.go b/pkg/metrics/types.go
index 5ca40bf11..46bff8bb3 100644
--- a/pkg/metrics/types.go
+++ b/pkg/metrics/types.go
@@ -81,7 +81,7 @@ type Metric struct {
 	MetricType           MetricType
 	PromQL               string // Optional: Only applicable for PromQL-based metrics
 	RawMetricName        string // Optional: Only applicable for QueryLabel-based metrics
-	RawMetricNameMapping map[string]string
+	RawMetricNameMapping map[string]string // Optional: Mapping from engine type to raw metric name.
 	Description          string
 	MetricScope          MetricScope
 }

From 1b87c39da01c469ea4fa7215c8a0f177145b4748 Mon Sep 17 00:00:00 2001
From: Le Xu <le.xu@bytedance.com>
Date: Tue, 22 Jul 2025 10:24:25 -0700
Subject: [PATCH 09/19] update format

Signed-off-by: Le Xu <le.xu@bytedance.com>
---
 pkg/metrics/types.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/metrics/types.go b/pkg/metrics/types.go
index 46bff8bb3..365f46787 100644
--- a/pkg/metrics/types.go
+++ b/pkg/metrics/types.go
@@ -79,8 +79,8 @@ const (
 type Metric struct {
 	MetricSource         MetricSource
 	MetricType           MetricType
-	PromQL               string // Optional: Only applicable for PromQL-based metrics
-	RawMetricName        string // Optional: Only applicable for QueryLabel-based metrics
+	PromQL               string            // Optional: Only applicable for PromQL-based metrics
+	RawMetricName        string            // Optional: Only applicable for QueryLabel-based metrics
 	RawMetricNameMapping map[string]string // Optional: Mapping from engine type to raw metric name.
 	Description          string
 	MetricScope          MetricScope

From 3c294c6db3ea78b5b040df07c18fdcbc340e260a Mon Sep 17 00:00:00 2001
From: Le Xu <le.xu@bytedance.com>
Date: Tue, 22 Jul 2025 22:40:20 -0700
Subject: [PATCH 10/19] fix missing modelname/pod

Signed-off-by: Le Xu <le.xu@bytedance.com>
---
 pkg/cache/cache_metrics.go | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/pkg/cache/cache_metrics.go b/pkg/cache/cache_metrics.go
index 4292e78d0..0d05bc12c 100644
--- a/pkg/cache/cache_metrics.go
+++ b/pkg/cache/cache_metrics.go
@@ -34,7 +34,10 @@ const (
 	MetricPortLabel                     = "model.aibrix.ai/metric-port"
 	defaultPodMetricRefreshIntervalInMS = 50
 	engineLabel                         = "model.aibrix.ai/engine"
+	portLabel                           = "model.aibrix.ai/port"
+	modelLabel                          = "model.aibrix.ai/name"
 	defaultEngineLabel                  = "vllm"
+	defaultModelName                    = ""
 )
 
 var (
@@ -237,7 +240,6 @@ func (c *Store) updateHistogramMetricFromRawMetrics(pod *Pod, allMetrics map[str
 			}
 
 			klog.V(5).InfoS("Successfully parsed metrics", "metric", metricName, "model", modelName, "PodIP", pod.Status.PodIP, "Port", podMetricPort, "metricValue", metricValue)
-
 		}
 	}
 }
@@ -338,11 +340,7 @@ func (c *Store) fetchMetrics(pod *Pod, allMetrics map[string]*dto.MetricFamily,
 		klog.V(4).Infof("Cannot find labelMetricName %v in collected metrics names", labelMetricName)
 		return nil, false
 	}
-	engineType, ok := pod.Labels[engineLabel]
-	if !ok {
-		klog.V(4).Infof("No engine label pod %v, default to %v", pod.Name, defaultEngineLabel)
-		engineType = defaultEngineLabel
-	}
+	engineType := getPodLabel(pod, engineLabel, defaultEngineLabel)
 	rawMetricName, ok := metric.RawMetricNameMapping[engineType]
 	if !ok {
 		klog.V(4).Infof("Cannot find engine type %v mapping for metrics %v", engineType, labelMetricName)
@@ -363,7 +361,10 @@ func (c *Store) updatePodRecord(pod *Pod, modelName string, metricName string, s
 		pod.Metrics.Store(metricName, metricValue)
 	} else if scope == metrics.PodModelMetricScope {
 		if modelName == "" {
-			return fmt.Errorf("modelName should not be empty for scope %v", scope)
+			modelName = getPodLabel(pod, modelLabel, defaultModelName)
+			if modelName == "" {
+				return fmt.Errorf("modelName should not be empty for scope %v", scope)
+			}
 		}
 		pod.ModelMetrics.Store(c.getPodModelMetricName(modelName, metricName), metricValue)
 	} else {
@@ -406,3 +407,12 @@ func getPodMetricPort(pod *Pod) int {
 	}
 	return defaultMetricPort
 }
+
+func getPodLabel(pod *Pod, labelName string, defaultValue string) string {
+	labelTarget, ok := pod.Labels[labelName]
+	if !ok {
+		klog.V(4).Infof("No label %v name for pod %v, default to %v", labelName, pod.Name, defaultEngineLabel)
+		return defaultValue
+	}
+	return labelTarget
+}

From 7fea5d7898691abc80ad6f36ed5a72346d1b4ec3 Mon Sep 17 00:00:00 2001
From: happyandslow <le.xu@bytedance.com>
Date: Wed, 23 Jul 2025 22:07:21 +0000
Subject: [PATCH 11/19] change busy time metrics type

Signed-off-by: happyandslow <le.xu@bytedance.com>
---
 pkg/plugins/gateway/algorithms/least_busy_time.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/plugins/gateway/algorithms/least_busy_time.go b/pkg/plugins/gateway/algorithms/least_busy_time.go
index fea73cf25..82cf3c2f2 100644
--- a/pkg/plugins/gateway/algorithms/least_busy_time.go
+++ b/pkg/plugins/gateway/algorithms/least_busy_time.go
@@ -53,7 +53,7 @@ func (r leastBusyTimeRouter) Route(ctx *types.RoutingContext, readyPodList types
 	minBusyTimeRatio := math.MaxFloat64 // <= 1 in general
 
 	for _, pod := range readyPodList.All() {
-		busyTimeRatio, err := r.cache.GetMetricValueByPod(pod.Name, pod.Namespace, metrics.GPUBusyTimeRatio) // todo: replace mock
+		busyTimeRatio, err := r.cache.GetMetricValueByPodModel(pod.Name, pod.Namespace, ctx.Model, metrics.GPUBusyTimeRatio) // todo: replace mock
 		if err != nil {
 			klog.Error(err)
 			continue

From 99dff785af91f60dcb8c910e023eeaa0e53556d6 Mon Sep 17 00:00:00 2001
From: Le Xu <le.xu@bytedance.com>
Date: Fri, 25 Jul 2025 11:31:44 -0700
Subject: [PATCH 12/19] addressing comments

Signed-off-by: Le Xu <le.xu@bytedance.com>
---
 pkg/cache/cache_metrics.go                    | 34 +++----------------
 pkg/cache/utils.go                            | 30 ++++++++++++++++
 pkg/metrics/metrics.go                        |  3 +-
 .../gateway/algorithms/least_busy_time.go     |  7 ++--
 .../gateway/algorithms/least_gpu_cache.go     |  8 ++---
 pkg/plugins/gateway/algorithms/least_util.go  |  7 ++--
 6 files changed, 48 insertions(+), 41 deletions(-)
 create mode 100644 pkg/cache/utils.go

diff --git a/pkg/cache/cache_metrics.go b/pkg/cache/cache_metrics.go
index 0d05bc12c..af3e451d4 100644
--- a/pkg/cache/cache_metrics.go
+++ b/pkg/cache/cache_metrics.go
@@ -17,7 +17,6 @@ package cache
 import (
 	"context"
 	"fmt"
-	"strconv"
 	"time"
 
 	prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1"
@@ -28,16 +27,16 @@ import (
 )
 
 const (
-	defaultMetricPort = 8000
 	// When the engine's HTTP proxy is separated from the engine itself,
 	// the request port and metrics port may differ, so a dedicated metrics port is required.
 	MetricPortLabel                     = "model.aibrix.ai/metric-port"
-	defaultPodMetricRefreshIntervalInMS = 50
 	engineLabel                         = "model.aibrix.ai/engine"
 	portLabel                           = "model.aibrix.ai/port"
 	modelLabel                          = "model.aibrix.ai/name"
-	defaultEngineLabel                  = "vllm"
+	defaultMetricPort                   = 8000
+	defaultEngineLabelValue             = "vllm"
 	defaultModelName                    = ""
+	defaultPodMetricRefreshIntervalInMS = 50
 )
 
 var (
@@ -49,7 +48,7 @@ var (
 		metrics.AvgGenerationThroughputToksPerS,
 		metrics.GPUCacheUsagePerc,
 		metrics.CPUCacheUsagePerc,
-		metrics.GPUBusyTimeRatio,
+		metrics.EngineUtilization,
 	}
 
 	// histogram metric example - time_to_first_token_seconds, _sum, _bucket _count.
@@ -340,7 +339,7 @@ func (c *Store) fetchMetrics(pod *Pod, allMetrics map[string]*dto.MetricFamily,
 		klog.V(4).Infof("Cannot find labelMetricName %v in collected metrics names", labelMetricName)
 		return nil, false
 	}
-	engineType := getPodLabel(pod, engineLabel, defaultEngineLabel)
+	engineType := getPodLabel(pod, engineLabel, defaultEngineLabelValue)
 	rawMetricName, ok := metric.RawMetricNameMapping[engineType]
 	if !ok {
 		klog.V(4).Infof("Cannot find engine type %v mapping for metrics %v", engineType, labelMetricName)
@@ -393,26 +392,3 @@ func (c *Store) aggregateMetrics() {
 		}
 	}
 }
-
-func getPodMetricPort(pod *Pod) int {
-	if pod == nil || pod.Labels == nil {
-		return defaultMetricPort
-	}
-	if v, ok := pod.Labels[MetricPortLabel]; ok && v != "" {
-		if p, err := strconv.Atoi(v); err == nil {
-			return p
-		} else {
-			klog.Warningf("Invalid value for label %s on pod %s/%s: %q. Using default port %d.", MetricPortLabel, pod.Namespace, pod.Name, v, defaultMetricPort)
-		}
-	}
-	return defaultMetricPort
-}
-
-func getPodLabel(pod *Pod, labelName string, defaultValue string) string {
-	labelTarget, ok := pod.Labels[labelName]
-	if !ok {
-		klog.V(4).Infof("No label %v name for pod %v, default to %v", labelName, pod.Name, defaultEngineLabel)
-		return defaultValue
-	}
-	return labelTarget
-}
diff --git a/pkg/cache/utils.go b/pkg/cache/utils.go
new file mode 100644
index 000000000..3a232d2a9
--- /dev/null
+++ b/pkg/cache/utils.go
@@ -0,0 +1,30 @@
+package cache
+
+import (
+	"strconv"
+
+	"k8s.io/klog/v2"
+)
+
+func getPodMetricPort(pod *Pod) int {
+	if pod == nil || pod.Labels == nil {
+		return defaultMetricPort
+	}
+	if v, ok := pod.Labels[MetricPortLabel]; ok && v != "" {
+		if p, err := strconv.Atoi(v); err == nil {
+			return p
+		} else {
+			klog.Warningf("Invalid value for label %s on pod %s/%s: %q. Using default port %d.", MetricPortLabel, pod.Namespace, pod.Name, v, defaultMetricPort)
+		}
+	}
+	return defaultMetricPort
+}
+
+func getPodLabel(pod *Pod, labelName string, defaultValue string) string {
+	labelTarget, ok := pod.Labels[labelName]
+	if !ok {
+		klog.V(4).Infof("No label %v name for pod %v, default to %v", labelName, pod.Name, defaultEngineLabelValue)
+		return defaultValue
+	}
+	return labelTarget
+}
diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go
index 5c48a9822..c163f21d7 100644
--- a/pkg/metrics/metrics.go
+++ b/pkg/metrics/metrics.go
@@ -40,6 +40,7 @@ const (
 	GPUCacheUsagePerc                    = "gpu_cache_usage_perc"
 	GPUBusyTimeRatio                     = "gpu_busy_time_ratio"
 	CPUCacheUsagePerc                    = "cpu_cache_usage_perc"
+	EngineUtilization                    = "engine_utilization"
 	AvgE2ELatencyPod                     = "avg_e2e_latency_pod"
 	AvgRequestsPerMinPod                 = "avg_requests_per_min_pod"
 	AvgPromptThroughputToksPerMinPod     = "avg_prompt_throughput_toks_per_min_pod"
@@ -283,7 +284,7 @@ var (
 			},
 			Description: "GPU cache usage percentage",
 		},
-		GPUBusyTimeRatio: {
+		EngineUtilization: {
 			MetricScope:  PodModelMetricScope,
 			MetricSource: PodRawMetrics,
 			MetricType: MetricType{
diff --git a/pkg/plugins/gateway/algorithms/least_busy_time.go b/pkg/plugins/gateway/algorithms/least_busy_time.go
index 82cf3c2f2..56ced9bce 100644
--- a/pkg/plugins/gateway/algorithms/least_busy_time.go
+++ b/pkg/plugins/gateway/algorithms/least_busy_time.go
@@ -53,13 +53,13 @@ func (r leastBusyTimeRouter) Route(ctx *types.RoutingContext, readyPodList types
 	minBusyTimeRatio := math.MaxFloat64 // <= 1 in general
 
 	for _, pod := range readyPodList.All() {
-		busyTimeRatio, err := r.cache.GetMetricValueByPodModel(pod.Name, pod.Namespace, ctx.Model, metrics.GPUBusyTimeRatio) // todo: replace mock
+		busyTimeRatio, err := r.cache.GetMetricValueByPod(pod.Name, pod.Namespace, metrics.GPUBusyTimeRatio) // todo: replace mock
 		if err != nil {
 			klog.Error(err)
 			continue
 		}
 		busyTimeRatioValue := busyTimeRatio.GetSimpleValue()
-		klog.Infof("pod: %v, podIP: %v, GPU busy time ratio: %v", pod.Name, pod.Status.PodIP, busyTimeRatioValue)
+		klog.V(4).Infof("pod: %v, podIP: %v, GPU busy time ratio: %v", pod.Name, pod.Status.PodIP, busyTimeRatioValue)
 
 		if busyTimeRatioValue < minBusyTimeRatio {
 			minBusyTimeRatio = busyTimeRatioValue
@@ -74,9 +74,6 @@ func (r leastBusyTimeRouter) Route(ctx *types.RoutingContext, readyPodList types
 		if err != nil {
 			return "", err
 		}
-		klog.Infof("select random pod: %v, podIP: %v", targetPod.Name, targetPod.Status.PodIP)
-	} else {
-		klog.Infof("select target pod: %v, podIP: %v, GPU busy time ratio: %v", targetPod.Name, targetPod.Status.PodIP, minBusyTimeRatio)
 	}
 
 	ctx.SetTargetPod(targetPod)
diff --git a/pkg/plugins/gateway/algorithms/least_gpu_cache.go b/pkg/plugins/gateway/algorithms/least_gpu_cache.go
index f7ceee83d..76beafd31 100644
--- a/pkg/plugins/gateway/algorithms/least_gpu_cache.go
+++ b/pkg/plugins/gateway/algorithms/least_gpu_cache.go
@@ -64,7 +64,7 @@ func (r leastGpuCacheRouter) Route(ctx *types.RoutingContext, readyPodList types
 		}
 		totalCache := gpuCache.GetSimpleValue()
 
-		klog.Infof("pod: %v, podIP: %v, gpuCache: %v",
+		klog.V(4).Infof("pod: %v, podIP: %v, gpuCache: %v",
 			pod.Name, pod.Status.PodIP, gpuCache.GetSimpleValue())
 
 		if totalCache <= minGpuCache {
@@ -80,16 +80,16 @@ func (r leastGpuCacheRouter) Route(ctx *types.RoutingContext, readyPodList types
 		if err != nil {
 			return "", err
 		}
-		klog.Infof("select random targetPod: %s(%s)", targetPod.Name, targetPod.Status.PodIP)
+		klog.V(4).Infof("select random targetPod: %s(%s)", targetPod.Name, targetPod.Status.PodIP)
 	} else {
-		klog.Infof("select targetPod: %s(%s) gpuCache: %v", targetPod.Name, targetPod.Status.PodIP, minGpuCache)
+		klog.V(4).Infof("select targetPod: %s(%s) gpuCache: %v", targetPod.Name, targetPod.Status.PodIP, minGpuCache)
 	}
 
 	if targetPod == nil {
 		return "", fmt.Errorf("no pods to forward request")
 	}
 
-	klog.Infof("targetPod: %s(%s)", targetPod.Name, targetPod.Status.PodIP)
+	klog.V(4).Infof("targetPod: %s(%s)", targetPod.Name, targetPod.Status.PodIP)
 	ctx.SetTargetPod(targetPod)
 	return ctx.TargetAddress(), nil
 }
diff --git a/pkg/plugins/gateway/algorithms/least_util.go b/pkg/plugins/gateway/algorithms/least_util.go
index c0201404a..9bd2d4db1 100644
--- a/pkg/plugins/gateway/algorithms/least_util.go
+++ b/pkg/plugins/gateway/algorithms/least_util.go
@@ -27,7 +27,7 @@ import (
 	klog "k8s.io/klog/v2"
 )
 
-const RouterUtil types.RoutingAlgorithm = "least-util"
+const RouterUtil types.RoutingAlgorithm = "least-utilization"
 
 func init() {
 	Register(RouterUtil, NewLeastUtilRouter)
@@ -53,7 +53,7 @@ func (r leastUtilRouter) Route(ctx *types.RoutingContext, readyPodList types.Pod
 	minBusyTimeRatio := math.MaxFloat64 // <= 1 in general
 
 	for _, pod := range readyPodList.All() {
-		busyTimeRatio, err := r.cache.GetMetricValueByPod(pod.Name, pod.Namespace, metrics.GPUBusyTimeRatio) // todo: replace mock
+		busyTimeRatio, err := r.cache.GetMetricValueByPodModel(pod.Name, pod.Namespace, ctx.Model, metrics.EngineUtilization) // todo: replace mock
 		if err != nil {
 			klog.Error(err)
 			continue
@@ -74,6 +74,9 @@ func (r leastUtilRouter) Route(ctx *types.RoutingContext, readyPodList types.Pod
 		if err != nil {
 			return "", err
 		}
+		klog.V(4).Infof("select random pod: %v, podIP: %v", targetPod.Name, targetPod.Status.PodIP)
+	} else {
+		klog.V(4).Infof("select target pod: %v, podIP: %v, GPU busy time ratio: %v", targetPod.Name, targetPod.Status.PodIP, minBusyTimeRatio)
 	}
 
 	ctx.SetTargetPod(targetPod)

From 41a0ee8a64b756693f4190b057fffcaf17d10236 Mon Sep 17 00:00:00 2001
From: Le Xu <le.xu@bytedance.com>
Date: Fri, 25 Jul 2025 11:36:22 -0700
Subject: [PATCH 13/19] renaming

Signed-off-by: Le Xu <le.xu@bytedance.com>
---
 pkg/plugins/gateway/algorithms/least_util.go | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/pkg/plugins/gateway/algorithms/least_util.go b/pkg/plugins/gateway/algorithms/least_util.go
index 9bd2d4db1..420228e08 100644
--- a/pkg/plugins/gateway/algorithms/least_util.go
+++ b/pkg/plugins/gateway/algorithms/least_util.go
@@ -50,19 +50,19 @@ func NewLeastUtilRouter() (types.Router, error) {
 
 func (r leastUtilRouter) Route(ctx *types.RoutingContext, readyPodList types.PodList) (string, error) {
 	var targetPod *v1.Pod
-	minBusyTimeRatio := math.MaxFloat64 // <= 1 in general
+	minUtilization := math.MaxFloat64 // <= 1 in general
 
 	for _, pod := range readyPodList.All() {
-		busyTimeRatio, err := r.cache.GetMetricValueByPodModel(pod.Name, pod.Namespace, ctx.Model, metrics.EngineUtilization) // todo: replace mock
+		utilization, err := r.cache.GetMetricValueByPodModel(pod.Name, pod.Namespace, ctx.Model, metrics.EngineUtilization) // todo: replace mock
 		if err != nil {
 			klog.Error(err)
 			continue
 		}
-		busyTimeRatioValue := busyTimeRatio.GetSimpleValue()
-		klog.V(4).Infof("pod: %v, podIP: %v, GPU busy time ratio: %v", pod.Name, pod.Status.PodIP, busyTimeRatioValue)
+		utilizationValue := utilization.GetSimpleValue()
+		klog.V(4).Infof("pod: %v, podIP: %v, engine utilization: %v", pod.Name, pod.Status.PodIP, utilizationValue)
 
-		if busyTimeRatioValue < minBusyTimeRatio {
-			minBusyTimeRatio = busyTimeRatioValue
+		if utilizationValue < minUtilization {
+			minUtilization = utilizationValue
 			targetPod = pod
 		}
 	}
@@ -76,7 +76,7 @@ func (r leastUtilRouter) Route(ctx *types.RoutingContext, readyPodList types.Pod
 		}
 		klog.V(4).Infof("select random pod: %v, podIP: %v", targetPod.Name, targetPod.Status.PodIP)
 	} else {
-		klog.V(4).Infof("select target pod: %v, podIP: %v, GPU busy time ratio: %v", targetPod.Name, targetPod.Status.PodIP, minBusyTimeRatio)
+		klog.V(4).Infof("select target pod: %v, podIP: %v, engine utilization: %v", targetPod.Name, targetPod.Status.PodIP, minUtilization)
 	}
 
 	ctx.SetTargetPod(targetPod)

From c280e738e1d39cbf4623976f5e5b69d245d254be Mon Sep 17 00:00:00 2001
From: Le Xu <le.xu@bytedance.com>
Date: Fri, 25 Jul 2025 11:39:48 -0700
Subject: [PATCH 14/19] formatting

Signed-off-by: Le Xu <le.xu@bytedance.com>
---
 pkg/cache/cache_metrics.go                   | 3 +--
 pkg/plugins/gateway/algorithms/least_util.go | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/pkg/cache/cache_metrics.go b/pkg/cache/cache_metrics.go
index af3e451d4..8ec804edd 100644
--- a/pkg/cache/cache_metrics.go
+++ b/pkg/cache/cache_metrics.go
@@ -37,6 +37,7 @@ const (
 	defaultEngineLabelValue             = "vllm"
 	defaultModelName                    = ""
 	defaultPodMetricRefreshIntervalInMS = 50
+	defaultPodMetricsWorkerCount        = 10
 )
 
 var (
@@ -122,8 +123,6 @@ func (c *Store) getPodModelMetricName(modelName string, metricName string) strin
 	return fmt.Sprintf("%s/%s", modelName, metricName)
 }
 
-const defaultPodMetricsWorkerCount = 10
-
 func (c *Store) updatePodMetrics() {
 	c.metaPods.Range(func(key string, metaPod *Pod) bool {
 		if !utils.FilterReadyPod(metaPod.Pod) {
diff --git a/pkg/plugins/gateway/algorithms/least_util.go b/pkg/plugins/gateway/algorithms/least_util.go
index 420228e08..af5d35bf5 100644
--- a/pkg/plugins/gateway/algorithms/least_util.go
+++ b/pkg/plugins/gateway/algorithms/least_util.go
@@ -53,7 +53,7 @@ func (r leastUtilRouter) Route(ctx *types.RoutingContext, readyPodList types.Pod
 	minUtilization := math.MaxFloat64 // <= 1 in general
 
 	for _, pod := range readyPodList.All() {
-		utilization, err := r.cache.GetMetricValueByPodModel(pod.Name, pod.Namespace, ctx.Model, metrics.EngineUtilization) // todo: replace mock
+		utilization, err := r.cache.GetMetricValueByPodModel(pod.Name, pod.Namespace, ctx.Model, metrics.EngineUtilization)
 		if err != nil {
 			klog.Error(err)
 			continue

From 47a52847760d50a8b9018fcfc01d7cf2c094342c Mon Sep 17 00:00:00 2001
From: Le Xu <le.xu@bytedance.com>
Date: Fri, 25 Jul 2025 11:52:26 -0700
Subject: [PATCH 15/19] renaming

Signed-off-by: Le Xu <le.xu@bytedance.com>
---
 pkg/cache/cache_metrics.go |  2 +-
 pkg/metrics/metrics.go     | 38 +++++++++++++++++++-------------------
 pkg/metrics/types.go       | 14 +++++++-------
 3 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/pkg/cache/cache_metrics.go b/pkg/cache/cache_metrics.go
index 8ec804edd..f9f38758d 100644
--- a/pkg/cache/cache_metrics.go
+++ b/pkg/cache/cache_metrics.go
@@ -339,7 +339,7 @@ func (c *Store) fetchMetrics(pod *Pod, allMetrics map[string]*dto.MetricFamily,
 		return nil, false
 	}
 	engineType := getPodLabel(pod, engineLabel, defaultEngineLabelValue)
-	rawMetricName, ok := metric.RawMetricNameMapping[engineType]
+	rawMetricName, ok := metric.EngineMetricsNameMapping[engineType]
 	if !ok {
 		klog.V(4).Infof("Cannot find engine type %v mapping for metrics %v", engineType, labelMetricName)
 		return nil, false
diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go
index c163f21d7..3e9726a9a 100644
--- a/pkg/metrics/metrics.go
+++ b/pkg/metrics/metrics.go
@@ -64,7 +64,7 @@ var (
 			MetricType: MetricType{
 				Raw: Counter,
 			},
-			RawMetricNameMapping: map[string]string{
+			EngineMetricsNameMapping: map[string]string{
 				"vllm":   "vllm:num_requests_running",
 				"sglang": "sglang:num_running_reqs",
 			},
@@ -76,7 +76,7 @@ var (
 			MetricType: MetricType{
 				Raw: Counter,
 			},
-			RawMetricNameMapping: map[string]string{
+			EngineMetricsNameMapping: map[string]string{
 				"vllm": "vllm:num_requests_waiting",
 			},
 			Description: "Number of waiting requests",
@@ -87,7 +87,7 @@ var (
 			MetricType: MetricType{
 				Raw: Counter,
 			},
-			RawMetricNameMapping: map[string]string{
+			EngineMetricsNameMapping: map[string]string{
 				"vllm": "vllm:num_requests_swapped",
 			},
 			Description: "Number of swapped requests",
@@ -99,7 +99,7 @@ var (
 			MetricType: MetricType{
 				Raw: Gauge,
 			},
-			RawMetricNameMapping: map[string]string{
+			EngineMetricsNameMapping: map[string]string{
 				"vllm": "vllm:avg_prompt_throughput_toks_per_s",
 			},
 			Description: "Average prompt throughput in tokens per second",
@@ -110,7 +110,7 @@ var (
 			MetricType: MetricType{
 				Raw: Gauge,
 			},
-			RawMetricNameMapping: map[string]string{
+			EngineMetricsNameMapping: map[string]string{
 				"vllm": "vllm:avg_generation_throughput_toks_per_s",
 			},
 			Description: "Average generation throughput in tokens per second",
@@ -122,7 +122,7 @@ var (
 			MetricType: MetricType{
 				Raw: Histogram,
 			},
-			RawMetricNameMapping: map[string]string{
+			EngineMetricsNameMapping: map[string]string{
 				"vllm": "vllm:iteration_tokens_total",
 			},
 			Description: "Total iteration tokens",
@@ -133,7 +133,7 @@ var (
 			MetricType: MetricType{
 				Raw: Histogram,
 			},
-			RawMetricNameMapping: map[string]string{
+			EngineMetricsNameMapping: map[string]string{
 				"vllm":   "vllm:time_to_first_token_seconds",
 				"sglang": "sglang:time_to_first_token_seconds",
 			},
@@ -145,7 +145,7 @@ var (
 			MetricType: MetricType{
 				Raw: Histogram,
 			},
-			RawMetricNameMapping: map[string]string{
+			EngineMetricsNameMapping: map[string]string{
 				"vllm":   "vllm:time_per_output_token_seconds",
 				"sglang": "sglang:inter_token_latency_seconds",
 			},
@@ -157,7 +157,7 @@ var (
 			MetricType: MetricType{
 				Raw: Histogram,
 			},
-			RawMetricNameMapping: map[string]string{
+			EngineMetricsNameMapping: map[string]string{
 				"vllm":   "vllm:e2e_request_latency_seconds",
 				"sglang": "sglang:e2e_request_latency_seconds",
 			},
@@ -169,7 +169,7 @@ var (
 			MetricType: MetricType{
 				Raw: Histogram,
 			},
-			RawMetricNameMapping: map[string]string{
+			EngineMetricsNameMapping: map[string]string{
 				"vllm": "vllm:request_queue_time_seconds",
 			},
 			Description: "Request queue time in seconds",
@@ -180,7 +180,7 @@ var (
 			MetricType: MetricType{
 				Raw: Histogram,
 			},
-			RawMetricNameMapping: map[string]string{
+			EngineMetricsNameMapping: map[string]string{
 				"vllm": "vllm:request_inference_time_seconds",
 			},
 			Description: "Request inference time in seconds",
@@ -191,7 +191,7 @@ var (
 			MetricType: MetricType{
 				Raw: Histogram,
 			},
-			RawMetricNameMapping: map[string]string{
+			EngineMetricsNameMapping: map[string]string{
 				"vllm": "vllm:request_decode_time_seconds",
 			},
 			Description: "Request decode time in seconds",
@@ -202,7 +202,7 @@ var (
 			MetricType: MetricType{
 				Raw: Histogram,
 			},
-			RawMetricNameMapping: map[string]string{
+			EngineMetricsNameMapping: map[string]string{
 				"vllm": "vllm:request_prefill_time_seconds",
 			},
 			Description: "Request prefill time in seconds",
@@ -277,7 +277,7 @@ var (
 			MetricType: MetricType{
 				Raw: Counter,
 			},
-			RawMetricNameMapping: map[string]string{
+			EngineMetricsNameMapping: map[string]string{
 				"vllm":   "vllm:gpu_cache_usage_perc",
 				"sglang": "sglang:token_usage",
 				"xllm":   "kv_cache_utilization",
@@ -290,7 +290,7 @@ var (
 			MetricType: MetricType{
 				Raw: Gauge,
 			},
-			RawMetricNameMapping: map[string]string{
+			EngineMetricsNameMapping: map[string]string{
 				"xllm": "engine_utilization",
 			},
 			Description: "GPU busy time ratio",
@@ -301,7 +301,7 @@ var (
 			MetricType: MetricType{
 				Raw: Counter,
 			},
-			RawMetricNameMapping: map[string]string{
+			EngineMetricsNameMapping: map[string]string{
 				"vllm": "vllm:cpu_cache_usage_perc",
 			},
 			Description: "CPU cache usage percentage",
@@ -349,7 +349,7 @@ var (
 				Query: QueryLabel,
 			},
 			RawMetricName: "lora_requests_info",
-			RawMetricNameMapping: map[string]string{
+			EngineMetricsNameMapping: map[string]string{
 				"vllm": "vllm:max_lora",
 			},
 			Description: "Max count of Lora Adapters",
@@ -361,7 +361,7 @@ var (
 				Query: QueryLabel,
 			},
 			RawMetricName: "lora_requests_info",
-			RawMetricNameMapping: map[string]string{
+			EngineMetricsNameMapping: map[string]string{
 				"vllm": "vllm:running_lora_adapters",
 			},
 			Description: "Count of running Lora Adapters",
@@ -373,7 +373,7 @@ var (
 				Query: QueryLabel,
 			},
 			RawMetricName: "lora_requests_info",
-			RawMetricNameMapping: map[string]string{
+			EngineMetricsNameMapping: map[string]string{
 				"vllm": "vllm:waiting_lora_adapters",
 			},
 			Description: "Count of waiting Lora Adapters",
diff --git a/pkg/metrics/types.go b/pkg/metrics/types.go
index 365f46787..8b10706c4 100644
--- a/pkg/metrics/types.go
+++ b/pkg/metrics/types.go
@@ -77,13 +77,13 @@ const (
 
 // Metric defines a unique metric with metadata.
 type Metric struct {
-	MetricSource         MetricSource
-	MetricType           MetricType
-	PromQL               string            // Optional: Only applicable for PromQL-based metrics
-	RawMetricName        string            // Optional: Only applicable for QueryLabel-based metrics
-	RawMetricNameMapping map[string]string // Optional: Mapping from engine type to raw metric name.
-	Description          string
-	MetricScope          MetricScope
+	MetricSource             MetricSource
+	MetricType               MetricType
+	PromQL                   string            // Optional: Only applicable for PromQL-based metrics
+	RawMetricName            string            // Optional: Only applicable for QueryLabel-based metrics
+	EngineMetricsNameMapping map[string]string // Optional: Mapping from engine type to raw metric name.
+	Description              string
+	MetricScope              MetricScope
 }
 
 // MetricValue is the interface for all metric values.

From 4b58504f599020dc8e5fce6a10f07bfe30f141eb Mon Sep 17 00:00:00 2001
From: Le Xu <le.xu@bytedance.com>
Date: Fri, 25 Jul 2025 11:55:07 -0700
Subject: [PATCH 16/19] adding license

Signed-off-by: Le Xu <le.xu@bytedance.com>
---
 pkg/cache/utils.go | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/pkg/cache/utils.go b/pkg/cache/utils.go
index 3a232d2a9..d2df3e861 100644
--- a/pkg/cache/utils.go
+++ b/pkg/cache/utils.go
@@ -1,3 +1,19 @@
+/*
+Copyright 2024 The Aibrix Team.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
 package cache
 
 import (

From 16ad7ee954432687cdaba0c2c968ec199c22c829 Mon Sep 17 00:00:00 2001
From: Le Xu <le.xu@bytedance.com>
Date: Fri, 25 Jul 2025 12:01:30 -0700
Subject: [PATCH 17/19] update comments

Signed-off-by: Le Xu <le.xu@bytedance.com>
---
 pkg/metrics/metrics.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go
index 3e9726a9a..4a938d4f5 100644
--- a/pkg/metrics/metrics.go
+++ b/pkg/metrics/metrics.go
@@ -279,7 +279,7 @@ var (
 			},
 			EngineMetricsNameMapping: map[string]string{
 				"vllm":   "vllm:gpu_cache_usage_perc",
-				"sglang": "sglang:token_usage",
+				"sglang": "sglang:token_usage", // Based on https://github.com/sgl-project/sglang/issues/5979
 				"xllm":   "kv_cache_utilization",
 			},
 			Description: "GPU cache usage percentage",

From 61fd8fbf2d8a73fe829e04cbf6463d347f189b15 Mon Sep 17 00:00:00 2001
From: Le Xu <le.xu@bytedance.com>
Date: Fri, 25 Jul 2025 14:58:05 -0700
Subject: [PATCH 18/19] splitting policy branch

Signed-off-by: Le Xu <le.xu@bytedance.com>
---
 .../gateway/algorithms/least_gpu_cache.go     | 95 -------------------
 pkg/plugins/gateway/algorithms/least_util.go  | 84 ----------------
 2 files changed, 179 deletions(-)
 delete mode 100644 pkg/plugins/gateway/algorithms/least_gpu_cache.go
 delete mode 100644 pkg/plugins/gateway/algorithms/least_util.go

diff --git a/pkg/plugins/gateway/algorithms/least_gpu_cache.go b/pkg/plugins/gateway/algorithms/least_gpu_cache.go
deleted file mode 100644
index 76beafd31..000000000
--- a/pkg/plugins/gateway/algorithms/least_gpu_cache.go
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
-Copyright 2024 The Aibrix Team.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package routingalgorithms
-
-import (
-	"fmt"
-	"math"
-	"math/rand"
-
-	"github.com/vllm-project/aibrix/pkg/cache"
-	metrics "github.com/vllm-project/aibrix/pkg/metrics"
-	"github.com/vllm-project/aibrix/pkg/types"
-	v1 "k8s.io/api/core/v1"
-	klog "k8s.io/klog/v2"
-)
-
-const RouterLeastGpuCache types.RoutingAlgorithm = "least-gpu-cache"
-
-func init() {
-	Register(RouterLeastGpuCache, NewLeastGpuCacheRouter)
-}
-
-type leastGpuCacheRouter struct {
-	cache cache.Cache
-}
-
-func NewLeastGpuCacheRouter() (types.Router, error) {
-	c, err := cache.Get()
-	if err != nil {
-		return nil, err
-	}
-
-	return leastGpuCacheRouter{
-		cache: c,
-	}, nil
-}
-
-func (r leastGpuCacheRouter) Route(ctx *types.RoutingContext, readyPodList types.PodList) (string, error) {
-	var targetPod *v1.Pod
-	minGpuCache := math.MaxFloat64
-
-	for _, pod := range readyPodList.All() {
-		// Due to metric refactor (pull/543) to better support lora and multi models,
-		// we change to use PodModelMetrics instead of PodMetrics in some scenarios.
-		// This works but doesn't look very promising, we can revisit this part later.
-		gpuCache, err := r.cache.GetMetricValueByPodModel(pod.Name, pod.Namespace, ctx.Model, metrics.GPUCacheUsagePerc)
-		if err != nil {
-			klog.Error(err)
-			continue
-		}
-		totalCache := gpuCache.GetSimpleValue()
-
-		klog.V(4).Infof("pod: %v, podIP: %v, gpuCache: %v",
-			pod.Name, pod.Status.PodIP, gpuCache.GetSimpleValue())
-
-		if totalCache <= minGpuCache {
-			minGpuCache = totalCache
-			targetPod = pod
-		}
-	}
-
-	// Use fallback if no valid metrics
-	if targetPod == nil {
-		var err error
-		targetPod, err = SelectRandomPodAsFallback(ctx, readyPodList.All(), rand.Intn)
-		if err != nil {
-			return "", err
-		}
-		klog.V(4).Infof("select random targetPod: %s(%s)", targetPod.Name, targetPod.Status.PodIP)
-	} else {
-		klog.V(4).Infof("select targetPod: %s(%s) gpuCache: %v", targetPod.Name, targetPod.Status.PodIP, minGpuCache)
-	}
-
-	if targetPod == nil {
-		return "", fmt.Errorf("no pods to forward request")
-	}
-
-	klog.V(4).Infof("targetPod: %s(%s)", targetPod.Name, targetPod.Status.PodIP)
-	ctx.SetTargetPod(targetPod)
-	return ctx.TargetAddress(), nil
-}
diff --git a/pkg/plugins/gateway/algorithms/least_util.go b/pkg/plugins/gateway/algorithms/least_util.go
deleted file mode 100644
index af5d35bf5..000000000
--- a/pkg/plugins/gateway/algorithms/least_util.go
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
-Copyright 2024 The Aibrix Team.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package routingalgorithms
-
-import (
-	"math"
-	"math/rand"
-
-	"github.com/vllm-project/aibrix/pkg/cache"
-	"github.com/vllm-project/aibrix/pkg/metrics"
-	"github.com/vllm-project/aibrix/pkg/types"
-	v1 "k8s.io/api/core/v1"
-	klog "k8s.io/klog/v2"
-)
-
-const RouterUtil types.RoutingAlgorithm = "least-utilization"
-
-func init() {
-	Register(RouterUtil, NewLeastUtilRouter)
-}
-
-type leastUtilRouter struct {
-	cache cache.Cache
-}
-
-func NewLeastUtilRouter() (types.Router, error) {
-	c, err := cache.Get()
-	if err != nil {
-		return nil, err
-	}
-
-	return leastUtilRouter{
-		cache: c,
-	}, nil
-}
-
-func (r leastUtilRouter) Route(ctx *types.RoutingContext, readyPodList types.PodList) (string, error) {
-	var targetPod *v1.Pod
-	minUtilization := math.MaxFloat64 // <= 1 in general
-
-	for _, pod := range readyPodList.All() {
-		utilization, err := r.cache.GetMetricValueByPodModel(pod.Name, pod.Namespace, ctx.Model, metrics.EngineUtilization)
-		if err != nil {
-			klog.Error(err)
-			continue
-		}
-		utilizationValue := utilization.GetSimpleValue()
-		klog.V(4).Infof("pod: %v, podIP: %v, engine utilization: %v", pod.Name, pod.Status.PodIP, utilizationValue)
-
-		if utilizationValue < minUtilization {
-			minUtilization = utilizationValue
-			targetPod = pod
-		}
-	}
-
-	// Use fallback if no valid metrics
-	if targetPod == nil {
-		var err error
-		targetPod, err = SelectRandomPodAsFallback(ctx, readyPodList.All(), rand.Intn)
-		if err != nil {
-			return "", err
-		}
-		klog.V(4).Infof("select random pod: %v, podIP: %v", targetPod.Name, targetPod.Status.PodIP)
-	} else {
-		klog.V(4).Infof("select target pod: %v, podIP: %v, engine utilization: %v", targetPod.Name, targetPod.Status.PodIP, minUtilization)
-	}
-
-	ctx.SetTargetPod(targetPod)
-	return ctx.TargetAddress(), nil
-}

From 14fb40045398fe12b31da06a01e5c06760e85657 Mon Sep 17 00:00:00 2001
From: Le Xu <le.xu@bytedance.com>
Date: Fri, 25 Jul 2025 15:32:33 -0700
Subject: [PATCH 19/19] addressing comments

Signed-off-by: Le Xu <le.xu@bytedance.com>
---
 pkg/cache/cache_metrics.go | 12 ++++++++----
 pkg/cache/utils.go         | 10 ++++++----
 pkg/metrics/metrics.go     |  3 ++-
 3 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/pkg/cache/cache_metrics.go b/pkg/cache/cache_metrics.go
index f9f38758d..3aa1400e2 100644
--- a/pkg/cache/cache_metrics.go
+++ b/pkg/cache/cache_metrics.go
@@ -35,7 +35,6 @@ const (
 	modelLabel                          = "model.aibrix.ai/name"
 	defaultMetricPort                   = 8000
 	defaultEngineLabelValue             = "vllm"
-	defaultModelName                    = ""
 	defaultPodMetricRefreshIntervalInMS = 50
 	defaultPodMetricsWorkerCount        = 10
 )
@@ -338,7 +337,11 @@ func (c *Store) fetchMetrics(pod *Pod, allMetrics map[string]*dto.MetricFamily,
 		klog.V(4).Infof("Cannot find labelMetricName %v in collected metrics names", labelMetricName)
 		return nil, false
 	}
-	engineType := getPodLabel(pod, engineLabel, defaultEngineLabelValue)
+	engineType, err := getPodLabel(pod, engineLabel)
+	if err != nil || engineType == "" { // label missing or blank: fall back to the default engine
+		klog.V(4).Infof("No usable engine label for pod %v (err: %v), default to %v", pod.Name, err, defaultEngineLabelValue)
+		engineType = defaultEngineLabelValue
+	}
 	rawMetricName, ok := metric.EngineMetricsNameMapping[engineType]
 	if !ok {
 		klog.V(4).Infof("Cannot find engine type %v mapping for metrics %v", engineType, labelMetricName)
@@ -358,9 +361,10 @@ func (c *Store) updatePodRecord(pod *Pod, modelName string, metricName string, s
 	if scope == metrics.PodMetricScope {
 		pod.Metrics.Store(metricName, metricValue)
 	} else if scope == metrics.PodModelMetricScope {
+		var err error
 		if modelName == "" {
-			modelName = getPodLabel(pod, modelLabel, defaultModelName)
-			if modelName == "" {
+			modelName, err = getPodLabel(pod, modelLabel)
+			if err != nil || modelName == "" { // a present-but-empty label is as unusable as a missing one
 				return fmt.Errorf("modelName should not be empty for scope %v", scope)
 			}
 		}
diff --git a/pkg/cache/utils.go b/pkg/cache/utils.go
index d2df3e861..307093da3 100644
--- a/pkg/cache/utils.go
+++ b/pkg/cache/utils.go
@@ -1,5 +1,5 @@
 /*
-Copyright 2024 The Aibrix Team.
+Copyright 2025 The Aibrix Team.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@ limitations under the License.
 package cache
 
 import (
+	"fmt"
 	"strconv"
 
 	"k8s.io/klog/v2"
@@ -36,11 +37,14 @@ func getPodMetricPort(pod *Pod) int {
 	return defaultMetricPort
 }
 
-func getPodLabel(pod *Pod, labelName string, defaultValue string) string {
+// getPodLabel returns the value of labelName on pod. When the label is absent
+// it returns "" and a non-nil error; no default is applied, so each caller
+// picks its own fallback. A label that is present but empty yields ("", nil).
+func getPodLabel(pod *Pod, labelName string) (string, error) {
 	labelTarget, ok := pod.Labels[labelName]
 	if !ok {
-		klog.V(4).Infof("No label %v name for pod %v, default to %v", labelName, pod.Name, defaultEngineLabelValue)
-		return defaultValue
+		klog.V(4).Infof("No label %v found for pod %v", labelName, pod.Name)
+		return "", fmt.Errorf("no label %v found for pod %v", labelName, pod.Name)
 	}
-	return labelTarget
+	return labelTarget, nil
 }
diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go
index 4a938d4f5..8350dd9c5 100644
--- a/pkg/metrics/metrics.go
+++ b/pkg/metrics/metrics.go
@@ -111,7 +111,8 @@ var (
 				Raw: Gauge,
 			},
 			EngineMetricsNameMapping: map[string]string{
-				"vllm": "vllm:avg_generation_throughput_toks_per_s",
+				"vllm":   "vllm:avg_generation_throughput_toks_per_s",
+				"sglang": "sglang:gen_throughput",
 			},
 			Description: "Average generation throughput in tokens per second",
 		},