From c98ab6a2e642863671f297f040d02170b499e12e Mon Sep 17 00:00:00 2001 From: Le Xu Date: Tue, 15 Jul 2025 17:58:36 -0700 Subject: [PATCH 01/19] adding raw metrics name conversion in metrics Signed-off-by: Le Xu --- pkg/cache/cache_metrics.go | 32 ++++++++++++++++--- pkg/metrics/metrics.go | 64 ++++++++++++++++++++++++++++++++++++-- pkg/metrics/types.go | 13 ++++---- 3 files changed, 96 insertions(+), 13 deletions(-) diff --git a/pkg/cache/cache_metrics.go b/pkg/cache/cache_metrics.go index c1caa1c78..be5b36174 100644 --- a/pkg/cache/cache_metrics.go +++ b/pkg/cache/cache_metrics.go @@ -33,6 +33,7 @@ const ( // the request port and metrics port may differ, so a dedicated metrics port is required. MetricPortLabel = "model.aibrix.ai/metric-port" defaultPodMetricRefreshIntervalInMS = 50 + engineLabel = "model.aibrix.ai/engine" ) var ( @@ -173,7 +174,7 @@ func (c *Store) updateSimpleMetricFromRawMetrics(pod *Pod, allMetrics map[string } // TODO: we should refact metricName to fit other engine - metricFamily, exists := allMetrics[fmt.Sprintf("vllm:%s", metricName)] + metricFamily, exists := c.fetchMetrics(pod, allMetrics, metricName) if !exists { klog.V(4).Infof("Cannot find %v in the pod metrics", metricName) continue @@ -208,8 +209,7 @@ func (c *Store) updateHistogramMetricFromRawMetrics(pod *Pod, allMetrics map[str klog.V(4).Infof("Cannot find %v in the metric list", metricName) continue } - - metricFamily, exists := allMetrics[fmt.Sprintf("vllm:%s", metricName)] + metricFamily, exists := c.fetchMetrics(pod, allMetrics, metricName) if !exists { klog.V(4).Infof("Cannot find %v in the pod metrics", metricName) continue @@ -250,7 +250,7 @@ func (c *Store) updateQueryLabelMetricFromRawMetrics(pod *Pod, allMetrics map[st } rawMetricName := metric.RawMetricName scope := metric.MetricScope - metricFamily, exists := allMetrics[fmt.Sprintf("vllm:%s", rawMetricName)] + metricFamily, exists := c.fetchMetrics(pod, allMetrics, rawMetricName) if !exists { klog.V(4).Infof("Cannot find %v in the pod metrics", rawMetricName) continue @@ -330,6 +330,30 @@ func (c *Store) queryUpdatePromQLMetrics(ctx context.Context, metric metrics.Met return nil } +func (c *Store) fetchMetrics(pod *Pod, allMetrics map[string]*dto.MetricFamily, labelMetricName string) (*dto.MetricFamily, bool) { + metric, exists := metrics.Metrics[labelMetricName] + if !exists { + klog.V(4).Infof("Cannot find %v in the metric list", labelMetricName) + return nil, false + } + engineType, ok := pod.Labels[engineLabel] + if !ok { + klog.V(4).InfoS("No engine label, default to vllm", "name", pod.Name) + engineType = "vllm" + } + rawMetricName, ok := metric.RawMetricNameMapping[engineType] + if !ok { + klog.V(4).Infof("Cannot find %v in the metric list, engine type %v", labelMetricName, engineType) + return nil, false + } + metricFamily, exists := allMetrics[rawMetricName] + if !exists { + klog.V(4).Infof("Cannot find raw metrics name %v, engine type %v", rawMetricName, engineType) + return nil, false + } + return metricFamily, true +} + // Update `PodMetrics` and `PodModelMetrics` according to the metric scope // TODO: replace in-place metric update podMetrics and podModelMetrics to fresh copy for preventing stale metric keys func (c *Store) updatePodRecord(pod *Pod, modelName string, metricName string, scope metrics.MetricScope, metricValue metrics.MetricValue) error { diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 8a7d82a65..d4acd4ac5 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -63,6 +63,10 @@ var ( MetricType: MetricType{ Raw: Counter, }, + RawMetricNameMapping: map[string]string{ + "vllm": "vllm:num_requests_running", + "sglang": "sglang:num_running_reqs", + }, Description: "Number of running requests", }, NumRequestsWaiting: { @@ -71,6 +75,9 @@ var ( MetricType: MetricType{ Raw: Counter, }, + RawMetricNameMapping: map[string]string{ + "vllm": "vllm:num_requests_waiting", + }, Description: "Number of waiting requests", }, NumRequestsSwapped: { @@ -79,6 +86,9 @@ var ( MetricType: MetricType{ Raw: Counter, }, + RawMetricNameMapping: map[string]string{ + "vllm": "vllm:num_requests_swapped", + }, Description: "Number of swapped requests", }, // Gauge metrics @@ -88,6 +98,9 @@ var ( MetricType: MetricType{ Raw: Gauge, }, + RawMetricNameMapping: map[string]string{ + "vllm": "vllm:avg_prompt_throughput_toks_per_s", + }, Description: "Average prompt throughput in tokens per second", }, AvgGenerationThroughputToksPerS: { @@ -96,6 +109,9 @@ var ( MetricType: MetricType{ Raw: Gauge, }, + RawMetricNameMapping: map[string]string{ + "vllm": "vllm:avg_generation_throughput_toks_per_s", + }, Description: "Average generation throughput in tokens per second", }, // Histogram metrics @@ -105,6 +121,9 @@ var ( MetricType: MetricType{ Raw: Histogram, }, + RawMetricNameMapping: map[string]string{ + "vllm": "vllm:iteration_tokens_total", + }, Description: "Total iteration tokens", }, TimeToFirstTokenSeconds: { @@ -113,6 +132,10 @@ var ( MetricType: MetricType{ Raw: Histogram, }, + RawMetricNameMapping: map[string]string{ + "vllm": "vllm:time_to_first_token_seconds", + "sglang": "vllm:sglang:time_to_first_token_seconds", + }, Description: "Time to first token in seconds", }, TimePerOutputTokenSeconds: { @@ -121,6 +144,10 @@ var ( MetricType: MetricType{ Raw: Histogram, }, + RawMetricNameMapping: map[string]string{ + "vllm": "vllm:time_per_output_token_seconds", + "sglang": "sglang:inter_token_latency_seconds", + }, Description: "Time per output token in seconds", }, E2ERequestLatencySeconds: { @@ -129,6 +156,10 @@ var ( MetricType: MetricType{ Raw: Histogram, }, + RawMetricNameMapping: map[string]string{ + "vllm": "vllm:e2e_request_latency_seconds", + "sglang": "sglang:e2e_request_latency_seconds", + }, Description: "End-to-end request latency in seconds", }, RequestQueueTimeSeconds: { @@ -137,6 +168,9 @@ var ( MetricType: MetricType{ Raw: Histogram, }, + RawMetricNameMapping: map[string]string{ + "vllm": "vllm:request_queue_time_seconds", + }, Description: "Request queue time in seconds", }, RequestInferenceTimeSeconds: { @@ -145,6 +179,9 @@ var ( MetricType: MetricType{ Raw: Histogram, }, + RawMetricNameMapping: map[string]string{ + "vllm": "vllm:request_inference_time_seconds", + }, Description: "Request inference time in seconds", }, RequestDecodeTimeSeconds: { @@ -153,6 +190,9 @@ var ( MetricType: MetricType{ Raw: Histogram, }, + RawMetricNameMapping: map[string]string{ + "vllm": "vllm:request_decode_time_seconds", + }, Description: "Request decode time in seconds", }, RequestPrefillTimeSeconds: { @@ -161,6 +201,9 @@ var ( MetricType: MetricType{ Raw: Histogram, }, + RawMetricNameMapping: map[string]string{ + "vllm": "vllm:request_prefill_time_seconds", + }, Description: "Request prefill time in seconds", }, // Query-based metrics @@ -233,6 +276,9 @@ var ( MetricType: MetricType{ Raw: Counter, }, + RawMetricNameMapping: map[string]string{ + "vllm": "vllm:gpu_cache_usage_perc", + }, Description: "GPU cache usage percentage", }, CPUCacheUsagePerc: { @@ -241,6 +287,9 @@ var ( MetricType: MetricType{ Raw: Counter, }, + RawMetricNameMapping: map[string]string{ + "vllm": "vllm:cpu_cache_usage_perc", + }, Description: "CPU cache usage percentage", }, AvgE2ELatencyPod: { @@ -286,7 +335,10 @@ var ( Query: QueryLabel, }, RawMetricName: "lora_requests_info", - Description: "Max count of Lora Adapters", + RawMetricNameMapping: map[string]string{ + "vllm": "vllm:max_lora", + }, + Description: "Max count of Lora Adapters", }, RunningLoraAdapters: { MetricScope: PodMetricScope, @@ -295,7 +347,10 @@ var ( Query: QueryLabel, }, RawMetricName: "lora_requests_info", - Description: "Count of running Lora Adapters", + RawMetricNameMapping: map[string]string{ + "vllm": "vllm:running_lora_adapters", + }, + Description: "Count of running Lora Adapters", }, WaitingLoraAdapters: { MetricScope: PodMetricScope, @@ -304,7 +359,10 @@ var ( Query: QueryLabel, }, RawMetricName: "lora_requests_info", - Description: "Count of waiting Lora Adapters", + RawMetricNameMapping: map[string]string{ + "vllm": "vllm:waiting_lora_adapters", + }, + Description: "Count of waiting Lora Adapters", }, VTCBucketSizeActive: { MetricScope: PodModelMetricScope, diff --git a/pkg/metrics/types.go b/pkg/metrics/types.go index 1e41eb297..5ca40bf11 100644 --- a/pkg/metrics/types.go +++ b/pkg/metrics/types.go @@ -77,12 +77,13 @@ const ( // Metric defines a unique metric with metadata. type Metric struct { - MetricSource MetricSource - MetricType MetricType - PromQL string // Optional: Only applicable for PromQL-based metrics - RawMetricName string // Optional: Only applicable for QueryLabel-based metrics - Description string - MetricScope MetricScope + MetricSource MetricSource + MetricType MetricType + PromQL string // Optional: Only applicable for PromQL-based metrics + RawMetricName string // Optional: Only applicable for QueryLabel-based metrics + RawMetricNameMapping map[string]string + Description string + MetricScope MetricScope } // MetricValue is the interface for all metric values. From 42a5d75beef6822012e4a71a542929fa5ee26168 Mon Sep 17 00:00:00 2001 From: Le Xu Date: Sun, 20 Jul 2025 20:21:58 -0700 Subject: [PATCH 02/19] adding policy least gpu/util Signed-off-by: Le Xu --- pkg/cache/cache_metrics.go | 1 + pkg/metrics/metrics.go | 15 ++- .../gateway/algorithms/least_gpu_cache.go | 92 +++++++++++++++++++ pkg/plugins/gateway/algorithms/least_util.go | 81 ++++++++++++++++ 4 files changed, 188 insertions(+), 1 deletion(-) create mode 100644 pkg/plugins/gateway/algorithms/least_gpu_cache.go create mode 100644 pkg/plugins/gateway/algorithms/least_util.go diff --git a/pkg/cache/cache_metrics.go b/pkg/cache/cache_metrics.go index be5b36174..a15b22245 100644 --- a/pkg/cache/cache_metrics.go +++ b/pkg/cache/cache_metrics.go @@ -45,6 +45,7 @@ var ( metrics.AvgGenerationThroughputToksPerS, metrics.GPUCacheUsagePerc, metrics.CPUCacheUsagePerc, + metrics.GPUBusyTimeRatio, } // histogram metric example - time_to_first_token_seconds, _sum, _bucket _count. diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index d4acd4ac5..8e91405eb 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -277,10 +277,23 @@ var ( Raw: Counter, }, RawMetricNameMapping: map[string]string{ - "vllm": "vllm:gpu_cache_usage_perc", + "vllm": "vllm:gpu_cache_usage_perc", + "sglang": "sglang:token_usage", + "xllm": "kv_cache_utilization", }, Description: "GPU cache usage percentage", }, + GPUBusyTimeRatio: { + MetricScope: PodModelMetricScope, + MetricSource: PodRawMetrics, + MetricType: MetricType{ + Raw: Gauge, + }, + RawMetricNameMapping: map[string]string{ + "xllm": "engine_utilization", + }, + Description: "GPU busy time ratio", + }, CPUCacheUsagePerc: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, diff --git a/pkg/plugins/gateway/algorithms/least_gpu_cache.go b/pkg/plugins/gateway/algorithms/least_gpu_cache.go new file mode 100644 index 000000000..cf9686124 --- /dev/null +++ b/pkg/plugins/gateway/algorithms/least_gpu_cache.go @@ -0,0 +1,92 @@ +/* +Copyright 2024 The Aibrix Team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package routingalgorithms + +import ( + "fmt" + "math" + "math/rand" + + "github.com/vllm-project/aibrix/pkg/cache" + metrics "github.com/vllm-project/aibrix/pkg/metrics" + "github.com/vllm-project/aibrix/pkg/types" + v1 "k8s.io/api/core/v1" + klog "k8s.io/klog/v2" +) + +const RouterLeastGpuCache types.RoutingAlgorithm = "least-gpu-cache" + +func init() { + Register(RouterLeastGpuCache, NewLeastGpuCacheRouter) +} + +type leastGpuCacheRouter struct { + cache cache.Cache +} + +func NewLeastGpuCacheRouter() (types.Router, error) { + c, err := cache.Get() + if err != nil { + return nil, err + } + + return leastGpuCacheRouter{ + cache: c, + }, nil +} + +func (r leastGpuCacheRouter) Route(ctx *types.RoutingContext, readyPodList types.PodList) (string, error) { + var targetPod *v1.Pod + minGpuCache := math.MaxFloat64 + + for _, pod := range readyPodList.All() { + // Due to metric refactor (pull/543) to better support lora and multi models, + // we change to use PodModelMetrics instead of PodMetrics in some scenarios. + // This works but doesn't look very promising, we can revisit this part later. + gpuCache, err := r.cache.GetMetricValueByPodModel(pod.Name, pod.Namespace, ctx.Model, metrics.GPUCacheUsagePerc) + if err != nil { + klog.Error(err) + continue + } + totalCache := gpuCache.GetSimpleValue() + + klog.V(4).Infof("pod: %v, podIP: %v, gpuCache: %v", + pod.Name, pod.Status.PodIP, gpuCache.GetSimpleValue()) + + if totalCache <= minGpuCache { + minGpuCache = totalCache + targetPod = pod + } + } + + // Use fallback if no valid metrics + if targetPod == nil { + var err error + targetPod, err = SelectRandomPodAsFallback(ctx, readyPodList.All(), rand.Intn) + if err != nil { + return "", err + } + } + + if targetPod == nil { + return "", fmt.Errorf("no pods to forward request") + } + + klog.V(4).Infof("targetPod: %s(%s)", targetPod.Name, targetPod.Status.PodIP) + ctx.SetTargetPod(targetPod) + return ctx.TargetAddress(), nil +} diff --git a/pkg/plugins/gateway/algorithms/least_util.go b/pkg/plugins/gateway/algorithms/least_util.go new file mode 100644 index 000000000..c0201404a --- /dev/null +++ b/pkg/plugins/gateway/algorithms/least_util.go @@ -0,0 +1,81 @@ +/* +Copyright 2024 The Aibrix Team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package routingalgorithms + +import ( + "math" + "math/rand" + + "github.com/vllm-project/aibrix/pkg/cache" + "github.com/vllm-project/aibrix/pkg/metrics" + "github.com/vllm-project/aibrix/pkg/types" + v1 "k8s.io/api/core/v1" + klog "k8s.io/klog/v2" +) + +const RouterUtil types.RoutingAlgorithm = "least-util" + +func init() { + Register(RouterUtil, NewLeastUtilRouter) +} + +type leastUtilRouter struct { + cache cache.Cache +} + +func NewLeastUtilRouter() (types.Router, error) { + c, err := cache.Get() + if err != nil { + return nil, err + } + + return leastUtilRouter{ + cache: c, + }, nil +} + +func (r leastUtilRouter) Route(ctx *types.RoutingContext, readyPodList types.PodList) (string, error) { + var targetPod *v1.Pod + minBusyTimeRatio := math.MaxFloat64 // <= 1 in general + + for _, pod := range readyPodList.All() { + busyTimeRatio, err := r.cache.GetMetricValueByPod(pod.Name, pod.Namespace, metrics.GPUBusyTimeRatio) // todo: replace mock + if err != nil { + klog.Error(err) + continue + } + busyTimeRatioValue := busyTimeRatio.GetSimpleValue() + klog.V(4).Infof("pod: %v, podIP: %v, GPU busy time ratio: %v", pod.Name, pod.Status.PodIP, busyTimeRatioValue) + + if busyTimeRatioValue < minBusyTimeRatio { + minBusyTimeRatio = busyTimeRatioValue + targetPod = pod + } + } + + // Use fallback if no valid metrics + if targetPod == nil { + var err error + targetPod, err = SelectRandomPodAsFallback(ctx, readyPodList.All(), rand.Intn) + if err != nil { + return "", err + } + } + + ctx.SetTargetPod(targetPod) + return ctx.TargetAddress(), nil +} From 9ad913ea5927e1ab5b47f6f13c3482adb874d755 Mon Sep 17 00:00:00 2001 From: Le Xu Date: Mon, 21 Jul 2025 15:42:06 -0700 Subject: [PATCH 03/19] enhance log for debugging purposes Signed-off-by: Le Xu --- pkg/plugins/gateway/algorithms/least_busy_time.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pkg/plugins/gateway/algorithms/least_busy_time.go b/pkg/plugins/gateway/algorithms/least_busy_time.go index 56ced9bce..ff8b085eb 100644 --- a/pkg/plugins/gateway/algorithms/least_busy_time.go +++ b/pkg/plugins/gateway/algorithms/least_busy_time.go @@ -59,11 +59,12 @@ func (r leastBusyTimeRouter) Route(ctx *types.RoutingContext, readyPodList types continue } busyTimeRatioValue := busyTimeRatio.GetSimpleValue() - klog.V(4).Infof("pod: %v, podIP: %v, GPU busy time ratio: %v", pod.Name, pod.Status.PodIP, busyTimeRatioValue) + klog.Infof("pod: %v, podIP: %v, GPU busy time ratio: %v", pod.Name, pod.Status.PodIP, busyTimeRatioValue) if busyTimeRatioValue < minBusyTimeRatio { minBusyTimeRatio = busyTimeRatioValue targetPod = pod + klog.Infof("select target pod: %v, podIP: %v, GPU busy time ratio: %v", pod.Name, pod.Status.PodIP, busyTimeRatioValue) } } @@ -74,6 +75,7 @@ func (r leastBusyTimeRouter) Route(ctx *types.RoutingContext, readyPodList types if err != nil { return "", err } + klog.Infof("select random pod: %v, podIP: %v", targetPod.Name, targetPod.Status.PodIP) } ctx.SetTargetPod(targetPod) From b164150304c52d2c970eb2d8257b779deabadd3c Mon Sep 17 00:00:00 2001 From: happyandslow Date: Mon, 21 Jul 2025 22:51:30 +0000 Subject: [PATCH 04/19] enhance log for debugging purposes Signed-off-by: root Signed-off-by: Le Xu --- pkg/plugins/gateway/algorithms/least_busy_time.go | 4 +++- pkg/plugins/gateway/algorithms/least_gpu_cache.go | 8 ++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/pkg/plugins/gateway/algorithms/least_busy_time.go b/pkg/plugins/gateway/algorithms/least_busy_time.go index ff8b085eb..ee806c482 100644 --- a/pkg/plugins/gateway/algorithms/least_busy_time.go +++ b/pkg/plugins/gateway/algorithms/least_busy_time.go @@ -64,7 +64,6 @@ func (r leastBusyTimeRouter) Route(ctx *types.RoutingContext, readyPodList types if busyTimeRatioValue < minBusyTimeRatio { minBusyTimeRatio = busyTimeRatioValue targetPod = pod - klog.Infof("select target pod: %v, podIP: %v, GPU busy time ratio: %v", pod.Name, pod.Status.PodIP, busyTimeRatioValue) } } @@ -77,6 +76,9 @@ func (r leastBusyTimeRouter) Route(ctx *types.RoutingContext, readyPodList types } klog.Infof("select random pod: %v, podIP: %v", targetPod.Name, targetPod.Status.PodIP) } + else{ + klog.Infof("select target pod: %v, podIP: %v, GPU busy time ratio: %v", pod.Name, pod.Status.PodIP, minBusyTimeRatio) + } ctx.SetTargetPod(targetPod) return ctx.TargetAddress(), nil diff --git a/pkg/plugins/gateway/algorithms/least_gpu_cache.go b/pkg/plugins/gateway/algorithms/least_gpu_cache.go index cf9686124..2a75278c7 100644 --- a/pkg/plugins/gateway/algorithms/least_gpu_cache.go +++ b/pkg/plugins/gateway/algorithms/least_gpu_cache.go @@ -64,7 +64,7 @@ func (r leastGpuCacheRouter) Route(ctx *types.RoutingContext, readyPodList types } totalCache := gpuCache.GetSimpleValue() - klog.V(4).Infof("pod: %v, podIP: %v, gpuCache: %v", + klog.Infof("pod: %v, podIP: %v, gpuCache: %v", pod.Name, pod.Status.PodIP, gpuCache.GetSimpleValue()) if totalCache <= minGpuCache { @@ -80,13 +80,17 @@ func (r leastGpuCacheRouter) Route(ctx *types.RoutingContext, readyPodList types if err != nil { return "", err } + klog.Infof("select random targetPod: %s(%s)", targetPod.Name, targetPod.Status.PodIP) + } + else{ + klog.Infof("select targetPod: %s(%s) gpuCache: %v", targetPod.Name, targetPod.Status.PodIP, gpuCache.GetSimpleValue()) } if targetPod == nil { return "", fmt.Errorf("no pods to forward request") } - klog.V(4).Infof("targetPod: %s(%s)", targetPod.Name, targetPod.Status.PodIP) + klog.Infof("targetPod: %s(%s)", targetPod.Name, targetPod.Status.PodIP) ctx.SetTargetPod(targetPod) return ctx.TargetAddress(), nil } From 4798279c117aa7c21931bed229532fe988538fc3 Mon Sep 17 00:00:00 2001 From: happyandslow Date: Mon, 21 Jul 2025 22:59:34 +0000 Subject: [PATCH 05/19] nhance log for debugging purposes v0.3.0-metrics-3 Signed-off-by: happyandslow Signed-off-by: Le Xu --- pkg/plugins/gateway/algorithms/least_busy_time.go | 5 ++--- pkg/plugins/gateway/algorithms/least_gpu_cache.go | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/pkg/plugins/gateway/algorithms/least_busy_time.go b/pkg/plugins/gateway/algorithms/least_busy_time.go index ee806c482..fea73cf25 100644 --- a/pkg/plugins/gateway/algorithms/least_busy_time.go +++ b/pkg/plugins/gateway/algorithms/least_busy_time.go @@ -75,9 +75,8 @@ func (r leastBusyTimeRouter) Route(ctx *types.RoutingContext, readyPodList types return "", err } klog.Infof("select random pod: %v, podIP: %v", targetPod.Name, targetPod.Status.PodIP) - } - else{ - klog.Infof("select target pod: %v, podIP: %v, GPU busy time ratio: %v", pod.Name, pod.Status.PodIP, minBusyTimeRatio) + } else { + klog.Infof("select target pod: %v, podIP: %v, GPU busy time ratio: %v", targetPod.Name, targetPod.Status.PodIP, minBusyTimeRatio) } ctx.SetTargetPod(targetPod) diff --git a/pkg/plugins/gateway/algorithms/least_gpu_cache.go b/pkg/plugins/gateway/algorithms/least_gpu_cache.go index 2a75278c7..f7ceee83d 100644 --- a/pkg/plugins/gateway/algorithms/least_gpu_cache.go +++ b/pkg/plugins/gateway/algorithms/least_gpu_cache.go @@ -81,9 +81,8 @@ func (r leastGpuCacheRouter) Route(ctx *types.RoutingContext, readyPodList types return "", err } klog.Infof("select random targetPod: %s(%s)", targetPod.Name, targetPod.Status.PodIP) - } - else{ - klog.Infof("select targetPod: %s(%s) gpuCache: %v", targetPod.Name, targetPod.Status.PodIP, gpuCache.GetSimpleValue()) + } else { + klog.Infof("select targetPod: %s(%s) gpuCache: %v", targetPod.Name, targetPod.Status.PodIP, minGpuCache) } if targetPod == nil { From 080d3e990e7f9662da8dc07617f941e81698bc37 Mon Sep 17 00:00:00 2001 From: Le Xu Date: Tue, 22 Jul 2025 10:16:15 -0700 Subject: [PATCH 06/19] enhance logs Signed-off-by: Le Xu --- pkg/cache/cache_metrics.go | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pkg/cache/cache_metrics.go b/pkg/cache/cache_metrics.go index a15b22245..4292e78d0 100644 --- a/pkg/cache/cache_metrics.go +++ b/pkg/cache/cache_metrics.go @@ -34,6 +34,7 @@ const ( MetricPortLabel = "model.aibrix.ai/metric-port" defaultPodMetricRefreshIntervalInMS = 50 engineLabel = "model.aibrix.ai/engine" + defaultEngineLabel = "vllm" ) var ( @@ -334,22 +335,22 @@ func (c *Store) queryUpdatePromQLMetrics(ctx context.Context, metric metrics.Met func (c *Store) fetchMetrics(pod *Pod, allMetrics map[string]*dto.MetricFamily, labelMetricName string) (*dto.MetricFamily, bool) { metric, exists := metrics.Metrics[labelMetricName] if !exists { - klog.V(4).Infof("Cannot find %v in the metric list", labelMetricName) + klog.V(4).Infof("Cannot find labelMetricName %v in collected metrics names", labelMetricName) return nil, false } engineType, ok := pod.Labels[engineLabel] if !ok { - klog.V(4).InfoS("No engine label, default to vllm", "name", pod.Name) - engineType = "vllm" + klog.V(4).Infof("No engine label pod %v, default to %v", pod.Name, defaultEngineLabel) + engineType = defaultEngineLabel } rawMetricName, ok := metric.RawMetricNameMapping[engineType] if !ok { - klog.V(4).Infof("Cannot find %v in the metric list, engine type %v", labelMetricName, engineType) + klog.V(4).Infof("Cannot find engine type %v mapping for metrics %v", engineType, labelMetricName) return nil, false } metricFamily, exists := allMetrics[rawMetricName] if !exists { - klog.V(4).Infof("Cannot find raw metrics name %v, engine type %v", rawMetricName, engineType) + klog.V(4).Infof("Cannot find raw metrics %v, engine type %v", rawMetricName, engineType) return nil, false } return metricFamily, true From db273edc235ca26221f514067760e3e19c48388e Mon Sep 17 00:00:00 2001 From: Le Xu Date: Tue, 22 Jul 2025 09:57:52 -0700 Subject: [PATCH 07/19] Update pkg/metrics/metrics.go Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Signed-off-by: Le Xu --- pkg/metrics/metrics.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 8e91405eb..5c48a9822 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -134,7 +134,7 @@ var ( }, RawMetricNameMapping: map[string]string{ "vllm": "vllm:time_to_first_token_seconds", - "sglang": "vllm:sglang:time_to_first_token_seconds", + "sglang": "sglang:time_to_first_token_seconds", }, Description: "Time to first token in seconds", }, From 6f05475d609650e36ab28b4e4d1427f43370ef2f Mon Sep 17 00:00:00 2001 From: Le Xu Date: Tue, 22 Jul 2025 10:17:46 -0700 Subject: [PATCH 08/19] Update pkg/metrics/types.go Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Signed-off-by: Le Xu --- pkg/metrics/types.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/metrics/types.go b/pkg/metrics/types.go index 5ca40bf11..46bff8bb3 100644 --- a/pkg/metrics/types.go +++ b/pkg/metrics/types.go @@ -81,7 +81,7 @@ type Metric struct { MetricType MetricType PromQL string // Optional: Only applicable for PromQL-based metrics RawMetricName string // Optional: Only applicable for QueryLabel-based metrics - RawMetricNameMapping map[string]string + RawMetricNameMapping map[string]string // Optional: Mapping from engine type to raw metric name. Description string MetricScope MetricScope } From 1b87c39da01c469ea4fa7215c8a0f177145b4748 Mon Sep 17 00:00:00 2001 From: Le Xu Date: Tue, 22 Jul 2025 10:24:25 -0700 Subject: [PATCH 09/19] update format Signed-off-by: Le Xu --- pkg/metrics/types.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/metrics/types.go b/pkg/metrics/types.go index 46bff8bb3..365f46787 100644 --- a/pkg/metrics/types.go +++ b/pkg/metrics/types.go @@ -79,8 +79,8 @@ const ( type Metric struct { MetricSource MetricSource MetricType MetricType - PromQL string // Optional: Only applicable for PromQL-based metrics - RawMetricName string // Optional: Only applicable for QueryLabel-based metrics + PromQL string // Optional: Only applicable for PromQL-based metrics + RawMetricName string // Optional: Only applicable for QueryLabel-based metrics RawMetricNameMapping map[string]string // Optional: Mapping from engine type to raw metric name. Description string MetricScope MetricScope From 3c294c6db3ea78b5b040df07c18fdcbc340e260a Mon Sep 17 00:00:00 2001 From: Le Xu Date: Tue, 22 Jul 2025 22:40:20 -0700 Subject: [PATCH 10/19] fix missing modelname/pod Signed-off-by: Le Xu --- pkg/cache/cache_metrics.go | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/pkg/cache/cache_metrics.go b/pkg/cache/cache_metrics.go index 4292e78d0..0d05bc12c 100644 --- a/pkg/cache/cache_metrics.go +++ b/pkg/cache/cache_metrics.go @@ -34,7 +34,10 @@ const ( MetricPortLabel = "model.aibrix.ai/metric-port" defaultPodMetricRefreshIntervalInMS = 50 engineLabel = "model.aibrix.ai/engine" + portLabel = "model.aibrix.ai/port" + modelLabel = "model.aibrix.ai/name" defaultEngineLabel = "vllm" + defaultModelName = "" ) var ( @@ -237,7 +240,6 @@ func (c *Store) updateHistogramMetricFromRawMetrics(pod *Pod, allMetrics map[str } klog.V(5).InfoS("Successfully parsed metrics", "metric", metricName, "model", modelName, "PodIP", pod.Status.PodIP, "Port", podMetricPort, "metricValue", metricValue) - } } } @@ -338,11 +340,7 @@ func (c *Store) fetchMetrics(pod *Pod, allMetrics map[string]*dto.MetricFamily, klog.V(4).Infof("Cannot find labelMetricName %v in collected metrics names", labelMetricName) return nil, false } - engineType, ok := pod.Labels[engineLabel] - if !ok { - klog.V(4).Infof("No engine label pod %v, default to %v", pod.Name, defaultEngineLabel) - engineType = defaultEngineLabel - } + engineType := getPodLabel(pod, engineLabel, defaultEngineLabel) rawMetricName, ok := metric.RawMetricNameMapping[engineType] if !ok { klog.V(4).Infof("Cannot find engine type %v mapping for metrics %v", engineType, labelMetricName) @@ -363,7 +361,10 @@ func (c *Store) updatePodRecord(pod *Pod, modelName string, metricName string, s pod.Metrics.Store(metricName, metricValue) } else if scope == metrics.PodModelMetricScope { if modelName == "" { - return fmt.Errorf("modelName should not be empty for scope %v", scope) + modelName = getPodLabel(pod, modelLabel, defaultModelName) + if modelName == "" { + return fmt.Errorf("modelName should not be empty for scope %v", scope) + } } pod.ModelMetrics.Store(c.getPodModelMetricName(modelName, metricName), metricValue) } else { @@ -406,3 +407,12 @@ func getPodMetricPort(pod *Pod) int { } return defaultMetricPort } + +func getPodLabel(pod *Pod, labelName string, defaultValue string) string { + labelTarget, ok := pod.Labels[labelName] + if !ok { + klog.V(4).Infof("No label %v name for pod %v, default to %v", labelName, pod.Name, defaultEngineLabel) + return defaultValue + } + return labelTarget +} From 7fea5d7898691abc80ad6f36ed5a72346d1b4ec3 Mon Sep 17 00:00:00 2001 From: happyandslow Date: Wed, 23 Jul 2025 22:07:21 +0000 Subject: [PATCH 11/19] change busy time metrics type Signed-off-by: happyandslow --- pkg/plugins/gateway/algorithms/least_busy_time.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/plugins/gateway/algorithms/least_busy_time.go b/pkg/plugins/gateway/algorithms/least_busy_time.go index fea73cf25..82cf3c2f2 100644 --- a/pkg/plugins/gateway/algorithms/least_busy_time.go +++ b/pkg/plugins/gateway/algorithms/least_busy_time.go @@ -53,7 +53,7 @@ func (r leastBusyTimeRouter) Route(ctx *types.RoutingContext, readyPodList types minBusyTimeRatio := math.MaxFloat64 // <= 1 in general for _, pod := range readyPodList.All() { - busyTimeRatio, err := r.cache.GetMetricValueByPod(pod.Name, pod.Namespace, metrics.GPUBusyTimeRatio) // todo: replace mock + busyTimeRatio, err := r.cache.GetMetricValueByPodModel(pod.Name, pod.Namespace, ctx.Model, metrics.GPUBusyTimeRatio) // todo: replace mock if err != nil { klog.Error(err) continue From 99dff785af91f60dcb8c910e023eeaa0e53556d6 Mon Sep 17 00:00:00 2001 From: Le Xu Date: Fri, 25 Jul 2025 11:31:44 -0700 Subject: [PATCH 12/19] addressing comments Signed-off-by: Le Xu --- pkg/cache/cache_metrics.go | 34 +++---------------- pkg/cache/utils.go | 30 ++++++++++++++++ pkg/metrics/metrics.go | 3 +- .../gateway/algorithms/least_busy_time.go | 7 ++-- .../gateway/algorithms/least_gpu_cache.go | 8 ++--- pkg/plugins/gateway/algorithms/least_util.go | 7 ++-- 6 files changed, 48 insertions(+), 41 deletions(-) create mode 100644 pkg/cache/utils.go diff --git a/pkg/cache/cache_metrics.go b/pkg/cache/cache_metrics.go index 0d05bc12c..af3e451d4 100644 --- a/pkg/cache/cache_metrics.go +++ b/pkg/cache/cache_metrics.go @@ -17,7 +17,6 @@ package cache import ( "context" "fmt" - "strconv" "time" prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1" @@ -28,16 +27,16 @@ import ( ) const ( - defaultMetricPort = 8000 // When the engine's HTTP proxy is separated from the engine itself, // the request port and metrics port may differ, so a dedicated metrics port is required. MetricPortLabel = "model.aibrix.ai/metric-port" - defaultPodMetricRefreshIntervalInMS = 50 engineLabel = "model.aibrix.ai/engine" portLabel = "model.aibrix.ai/port" modelLabel = "model.aibrix.ai/name" - defaultEngineLabel = "vllm" + defaultMetricPort = 8000 + defaultEngineLabelValue = "vllm" defaultModelName = "" + defaultPodMetricRefreshIntervalInMS = 50 ) var ( @@ -49,7 +48,7 @@ var ( metrics.AvgGenerationThroughputToksPerS, metrics.GPUCacheUsagePerc, metrics.CPUCacheUsagePerc, - metrics.GPUBusyTimeRatio, + metrics.EngineUtilization, } // histogram metric example - time_to_first_token_seconds, _sum, _bucket _count. @@ -340,7 +339,7 @@ func (c *Store) fetchMetrics(pod *Pod, allMetrics map[string]*dto.MetricFamily, klog.V(4).Infof("Cannot find labelMetricName %v in collected metrics names", labelMetricName) return nil, false } - engineType := getPodLabel(pod, engineLabel, defaultEngineLabel) + engineType := getPodLabel(pod, engineLabel, defaultEngineLabelValue) rawMetricName, ok := metric.RawMetricNameMapping[engineType] if !ok { klog.V(4).Infof("Cannot find engine type %v mapping for metrics %v", engineType, labelMetricName) @@ -393,26 +392,3 @@ func (c *Store) aggregateMetrics() { } } } - -func getPodMetricPort(pod *Pod) int { - if pod == nil || pod.Labels == nil { - return defaultMetricPort - } - if v, ok := pod.Labels[MetricPortLabel]; ok && v != "" { - if p, err := strconv.Atoi(v); err == nil { - return p - } else { - klog.Warningf("Invalid value for label %s on pod %s/%s: %q. Using default port %d.", MetricPortLabel, pod.Namespace, pod.Name, v, defaultMetricPort) - } - } - return defaultMetricPort -} - -func getPodLabel(pod *Pod, labelName string, defaultValue string) string { - labelTarget, ok := pod.Labels[labelName] - if !ok { - klog.V(4).Infof("No label %v name for pod %v, default to %v", labelName, pod.Name, defaultEngineLabel) - return defaultValue - } - return labelTarget -} diff --git a/pkg/cache/utils.go b/pkg/cache/utils.go new file mode 100644 index 000000000..3a232d2a9 --- /dev/null +++ b/pkg/cache/utils.go @@ -0,0 +1,30 @@ +package cache + +import ( + "strconv" + + "k8s.io/klog/v2" +) + +func getPodMetricPort(pod *Pod) int { + if pod == nil || pod.Labels == nil { + return defaultMetricPort + } + if v, ok := pod.Labels[MetricPortLabel]; ok && v != "" { + if p, err := strconv.Atoi(v); err == nil { + return p + } else { + klog.Warningf("Invalid value for label %s on pod %s/%s: %q. Using default port %d.", MetricPortLabel, pod.Namespace, pod.Name, v, defaultMetricPort) + } + } + return defaultMetricPort +} + +func getPodLabel(pod *Pod, labelName string, defaultValue string) string { + labelTarget, ok := pod.Labels[labelName] + if !ok { + klog.V(4).Infof("No label %v name for pod %v, default to %v", labelName, pod.Name, defaultEngineLabelValue) + return defaultValue + } + return labelTarget +} diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 5c48a9822..c163f21d7 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -40,6 +40,7 @@ const ( GPUCacheUsagePerc = "gpu_cache_usage_perc" GPUBusyTimeRatio = "gpu_busy_time_ratio" CPUCacheUsagePerc = "cpu_cache_usage_perc" + EngineUtilization = "engine_utilization" AvgE2ELatencyPod = "avg_e2e_latency_pod" AvgRequestsPerMinPod = "avg_requests_per_min_pod" AvgPromptThroughputToksPerMinPod = "avg_prompt_throughput_toks_per_min_pod" @@ -283,7 +284,7 @@ var ( }, Description: "GPU cache usage percentage", }, - GPUBusyTimeRatio: { + EngineUtilization: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ diff --git a/pkg/plugins/gateway/algorithms/least_busy_time.go b/pkg/plugins/gateway/algorithms/least_busy_time.go index 82cf3c2f2..56ced9bce 100644 --- a/pkg/plugins/gateway/algorithms/least_busy_time.go +++ b/pkg/plugins/gateway/algorithms/least_busy_time.go @@ -53,13 +53,13 @@ func (r leastBusyTimeRouter) Route(ctx *types.RoutingContext, readyPodList types minBusyTimeRatio := math.MaxFloat64 // <= 1 in general for _, pod := range readyPodList.All() { - busyTimeRatio, err := r.cache.GetMetricValueByPodModel(pod.Name, pod.Namespace, ctx.Model, metrics.GPUBusyTimeRatio) // todo: replace mock + busyTimeRatio, err := r.cache.GetMetricValueByPod(pod.Name, pod.Namespace, metrics.GPUBusyTimeRatio) // todo: replace mock if err != nil { klog.Error(err) continue } busyTimeRatioValue := busyTimeRatio.GetSimpleValue() - klog.Infof("pod: %v, podIP: %v, GPU busy time ratio: %v", pod.Name, pod.Status.PodIP, busyTimeRatioValue) + klog.V(4).Infof("pod: %v, podIP: %v, GPU busy time ratio: %v", pod.Name, pod.Status.PodIP, busyTimeRatioValue) if busyTimeRatioValue < minBusyTimeRatio { minBusyTimeRatio = busyTimeRatioValue @@ -74,9 +74,6 @@ func (r leastBusyTimeRouter) Route(ctx *types.RoutingContext, readyPodList types if err != nil { return "", err } - klog.Infof("select random pod: %v, podIP: %v", targetPod.Name, targetPod.Status.PodIP) - } else { - klog.Infof("select target pod: %v, podIP: %v, GPU busy time ratio: %v", targetPod.Name, targetPod.Status.PodIP, minBusyTimeRatio) } ctx.SetTargetPod(targetPod) diff --git a/pkg/plugins/gateway/algorithms/least_gpu_cache.go b/pkg/plugins/gateway/algorithms/least_gpu_cache.go index f7ceee83d..76beafd31 100644 --- a/pkg/plugins/gateway/algorithms/least_gpu_cache.go +++ b/pkg/plugins/gateway/algorithms/least_gpu_cache.go @@ -64,7 +64,7 @@ func (r leastGpuCacheRouter) Route(ctx *types.RoutingContext, readyPodList types } totalCache := gpuCache.GetSimpleValue() - klog.Infof("pod: %v, podIP: %v, gpuCache: %v", + klog.V(4).Infof("pod: %v, podIP: %v, gpuCache: %v", pod.Name, pod.Status.PodIP, gpuCache.GetSimpleValue()) if totalCache <= minGpuCache { @@ -80,16 +80,16 @@ func (r leastGpuCacheRouter) Route(ctx *types.RoutingContext, readyPodList types if err != nil { return "", err } - klog.Infof("select random targetPod: %s(%s)", targetPod.Name, targetPod.Status.PodIP) + klog.V(4).Infof("select random targetPod: %s(%s)", targetPod.Name, targetPod.Status.PodIP) } else { - klog.Infof("select targetPod: %s(%s) gpuCache: %v", targetPod.Name, targetPod.Status.PodIP, minGpuCache) + klog.V(4).Infof("select targetPod: %s(%s) gpuCache: %v", targetPod.Name, targetPod.Status.PodIP, minGpuCache) } if targetPod == nil { return "", fmt.Errorf("no pods to forward request") } - klog.Infof("targetPod: %s(%s)", targetPod.Name, targetPod.Status.PodIP) + klog.V(4).Infof("targetPod: %s(%s)", targetPod.Name, targetPod.Status.PodIP) ctx.SetTargetPod(targetPod) return ctx.TargetAddress(), nil } diff --git a/pkg/plugins/gateway/algorithms/least_util.go b/pkg/plugins/gateway/algorithms/least_util.go index c0201404a..9bd2d4db1 100644 --- a/pkg/plugins/gateway/algorithms/least_util.go +++ b/pkg/plugins/gateway/algorithms/least_util.go @@ -27,7 +27,7 @@ import ( klog "k8s.io/klog/v2" ) -const RouterUtil types.RoutingAlgorithm = "least-util" +const RouterUtil types.RoutingAlgorithm = "least-utilization" func init() { Register(RouterUtil, NewLeastUtilRouter) @@ -53,7 +53,7 @@ func (r leastUtilRouter) Route(ctx *types.RoutingContext, readyPodList types.Pod minBusyTimeRatio := math.MaxFloat64 // <= 1 in general for _, pod := range readyPodList.All() { - busyTimeRatio, err := r.cache.GetMetricValueByPod(pod.Name, pod.Namespace, metrics.GPUBusyTimeRatio) // todo: replace mock + busyTimeRatio, err := r.cache.GetMetricValueByPodModel(pod.Name, pod.Namespace, ctx.Model, metrics.EngineUtilization) // todo: replace mock if err != nil { klog.Error(err) continue @@ -74,6 +74,9 @@ func (r leastUtilRouter) Route(ctx *types.RoutingContext, readyPodList types.Pod if err != nil { return "", err } + klog.V(4).Infof("select random pod: %v, podIP: %v", targetPod.Name, targetPod.Status.PodIP) + } else { + klog.V(4).Infof("select target pod: %v, podIP: %v, GPU busy time ratio: %v", targetPod.Name, targetPod.Status.PodIP, minBusyTimeRatio) } ctx.SetTargetPod(targetPod) From 41a0ee8a64b756693f4190b057fffcaf17d10236 Mon Sep 17 00:00:00 2001 From: Le Xu Date: Fri, 25 Jul 2025 11:36:22 -0700 Subject: [PATCH 13/19] renaming Signed-off-by: Le Xu --- pkg/plugins/gateway/algorithms/least_util.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pkg/plugins/gateway/algorithms/least_util.go b/pkg/plugins/gateway/algorithms/least_util.go index 9bd2d4db1..420228e08 100644 --- a/pkg/plugins/gateway/algorithms/least_util.go +++ b/pkg/plugins/gateway/algorithms/least_util.go @@ -50,19 +50,19 @@ func NewLeastUtilRouter() (types.Router, error) { func (r leastUtilRouter) Route(ctx *types.RoutingContext, readyPodList types.PodList) (string, error) { var targetPod *v1.Pod - minBusyTimeRatio := math.MaxFloat64 // <= 1 in general + minUtilization := math.MaxFloat64 // <= 1 in general for _, pod := range readyPodList.All() { - busyTimeRatio, err := r.cache.GetMetricValueByPodModel(pod.Name, pod.Namespace, ctx.Model, metrics.EngineUtilization) // todo: replace mock + utilization, err := r.cache.GetMetricValueByPodModel(pod.Name, pod.Namespace, ctx.Model, metrics.EngineUtilization) // todo: replace mock if err != nil { klog.Error(err) continue } - busyTimeRatioValue := busyTimeRatio.GetSimpleValue() - klog.V(4).Infof("pod: %v, podIP: %v, GPU busy time ratio: %v", pod.Name, pod.Status.PodIP, busyTimeRatioValue) + utilizationValue := utilization.GetSimpleValue() + klog.V(4).Infof("pod: %v, podIP: %v, engine utilization: %v", pod.Name, pod.Status.PodIP, utilizationValue) - if busyTimeRatioValue < minBusyTimeRatio { - minBusyTimeRatio = busyTimeRatioValue + if utilizationValue < minUtilization { + minUtilization = utilizationValue targetPod = pod } } @@ -76,7 +76,7 @@ func (r leastUtilRouter) Route(ctx *types.RoutingContext, readyPodList types.Pod } klog.V(4).Infof("select random pod: %v, podIP: %v", targetPod.Name, targetPod.Status.PodIP) } else { - klog.V(4).Infof("select target pod: %v, podIP: %v, GPU busy time ratio: %v", targetPod.Name, targetPod.Status.PodIP, minBusyTimeRatio) + klog.V(4).Infof("select target pod: %v, podIP: %v, engine utilization: %v", targetPod.Name, targetPod.Status.PodIP, minUtilization) } ctx.SetTargetPod(targetPod) From c280e738e1d39cbf4623976f5e5b69d245d254be Mon Sep 17 00:00:00 2001 From: Le Xu Date: Fri, 25 Jul 2025 11:39:48 -0700 Subject: [PATCH 14/19] formatting Signed-off-by: Le Xu --- pkg/cache/cache_metrics.go | 3 +-- pkg/plugins/gateway/algorithms/least_util.go | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pkg/cache/cache_metrics.go b/pkg/cache/cache_metrics.go index af3e451d4..8ec804edd 100644 --- a/pkg/cache/cache_metrics.go +++ b/pkg/cache/cache_metrics.go @@ -37,6 +37,7 @@ const ( defaultEngineLabelValue = "vllm" defaultModelName = "" defaultPodMetricRefreshIntervalInMS = 50 + defaultPodMetricsWorkerCount = 10 ) var ( @@ -122,8 +123,6 @@ func (c *Store) getPodModelMetricName(modelName string, metricName string) strin return fmt.Sprintf("%s/%s", modelName, metricName) } -const defaultPodMetricsWorkerCount = 10 - func (c *Store) updatePodMetrics() { c.metaPods.Range(func(key string, metaPod *Pod) bool { if !utils.FilterReadyPod(metaPod.Pod) { diff --git a/pkg/plugins/gateway/algorithms/least_util.go b/pkg/plugins/gateway/algorithms/least_util.go index 420228e08..af5d35bf5 100644 --- a/pkg/plugins/gateway/algorithms/least_util.go +++ b/pkg/plugins/gateway/algorithms/least_util.go @@ -53,7 +53,7 @@ func (r leastUtilRouter) Route(ctx *types.RoutingContext, readyPodList types.Pod minUtilization := math.MaxFloat64 // <= 1 in general for _, pod := range readyPodList.All() { - utilization, err := r.cache.GetMetricValueByPodModel(pod.Name, pod.Namespace, ctx.Model, metrics.EngineUtilization) // todo: replace mock + utilization, err := r.cache.GetMetricValueByPodModel(pod.Name, pod.Namespace, ctx.Model, metrics.EngineUtilization) if err != nil { klog.Error(err) continue From 47a52847760d50a8b9018fcfc01d7cf2c094342c Mon Sep 17 00:00:00 2001 From: Le Xu Date: Fri, 25 Jul 2025 11:52:26 -0700 Subject: [PATCH 15/19] renaming Signed-off-by: Le Xu --- pkg/cache/cache_metrics.go | 2 +- pkg/metrics/metrics.go | 38 +++++++++++++++++++------------------- pkg/metrics/types.go | 14 +++++++------- 3 files changed, 27 insertions(+), 27 deletions(-) diff --git a/pkg/cache/cache_metrics.go b/pkg/cache/cache_metrics.go index 8ec804edd..f9f38758d 100644 --- a/pkg/cache/cache_metrics.go +++ b/pkg/cache/cache_metrics.go @@ -339,7 +339,7 @@ func (c *Store) fetchMetrics(pod *Pod, allMetrics map[string]*dto.MetricFamily, return nil, false } engineType := getPodLabel(pod, engineLabel, defaultEngineLabelValue) - rawMetricName, ok := metric.RawMetricNameMapping[engineType] + rawMetricName, ok := metric.EngineMetricsNameMapping[engineType] if !ok { klog.V(4).Infof("Cannot find engine type %v mapping for metrics %v", engineType, labelMetricName) return nil, false diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index c163f21d7..3e9726a9a 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -64,7 +64,7 @@ var ( MetricType: MetricType{ Raw: Counter, }, - RawMetricNameMapping: map[string]string{ + EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:num_requests_running", "sglang": "sglang:num_running_reqs", }, @@ -76,7 +76,7 @@ var ( MetricType: MetricType{ Raw: Counter, }, - RawMetricNameMapping: map[string]string{ + EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:num_requests_waiting", }, Description: "Number of waiting requests", @@ -87,7 +87,7 @@ var ( MetricType: MetricType{ Raw: Counter, }, - RawMetricNameMapping: map[string]string{ + EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:num_requests_swapped", }, Description: "Number of swapped requests", @@ -99,7 +99,7 @@ var ( MetricType: MetricType{ Raw: Gauge, }, - RawMetricNameMapping: map[string]string{ + EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:avg_prompt_throughput_toks_per_s", }, Description: "Average prompt throughput in tokens per second", @@ -110,7 +110,7 @@ var ( MetricType: MetricType{ Raw: Gauge, }, - RawMetricNameMapping: map[string]string{ + EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:avg_generation_throughput_toks_per_s", }, Description: "Average generation throughput in tokens per second", @@ -122,7 +122,7 @@ var ( MetricType: MetricType{ Raw: Histogram, }, - RawMetricNameMapping: map[string]string{ + EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:iteration_tokens_total", }, Description: "Total iteration tokens", @@ -133,7 +133,7 @@ var ( MetricType: MetricType{ Raw: Histogram, }, - RawMetricNameMapping: map[string]string{ + EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:time_to_first_token_seconds", "sglang": "sglang:time_to_first_token_seconds", }, @@ -145,7 +145,7 @@ var ( MetricType: MetricType{ Raw: Histogram, }, - RawMetricNameMapping: map[string]string{ + EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:time_per_output_token_seconds", "sglang": "sglang:inter_token_latency_seconds", }, @@ -157,7 +157,7 @@ var ( MetricType: MetricType{ Raw: Histogram, }, - RawMetricNameMapping: map[string]string{ + EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:e2e_request_latency_seconds", "sglang": "sglang:e2e_request_latency_seconds", }, @@ -169,7 +169,7 @@ var ( MetricType: MetricType{ Raw: Histogram, }, - RawMetricNameMapping: map[string]string{ + EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:request_queue_time_seconds", }, Description: "Request queue time in seconds", @@ -180,7 +180,7 @@ var ( MetricType: MetricType{ Raw: Histogram, }, - RawMetricNameMapping: map[string]string{ + EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:request_inference_time_seconds", }, Description: "Request inference time in seconds", @@ -191,7 +191,7 @@ var ( MetricType: MetricType{ Raw: Histogram, }, - RawMetricNameMapping: map[string]string{ + EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:request_decode_time_seconds", }, Description: "Request decode time in seconds", @@ -202,7 +202,7 @@ var ( MetricType: MetricType{ Raw: Histogram, }, - RawMetricNameMapping: map[string]string{ + EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:request_prefill_time_seconds", }, Description: "Request prefill time in seconds", @@ -277,7 +277,7 @@ var ( MetricType: MetricType{ Raw: Counter, }, - RawMetricNameMapping: map[string]string{ + EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:gpu_cache_usage_perc", "sglang": "sglang:token_usage", "xllm": "kv_cache_utilization", @@ -290,7 +290,7 @@ var ( MetricType: MetricType{ Raw: Gauge, }, - RawMetricNameMapping: map[string]string{ + EngineMetricsNameMapping: map[string]string{ "xllm": "engine_utilization", }, Description: "GPU busy time ratio", @@ -301,7 +301,7 @@ var ( MetricType: MetricType{ Raw: Counter, }, - RawMetricNameMapping: map[string]string{ + EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:cpu_cache_usage_perc", }, Description: "CPU cache usage percentage", @@ -349,7 +349,7 @@ var ( Query: QueryLabel, }, RawMetricName: "lora_requests_info", - RawMetricNameMapping: map[string]string{ + EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:max_lora", }, Description: "Max count of Lora Adapters", @@ -361,7 +361,7 @@ var ( Query: QueryLabel, }, RawMetricName: "lora_requests_info", - RawMetricNameMapping: map[string]string{ + EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:running_lora_adapters", }, Description: "Count of running Lora Adapters", @@ -373,7 +373,7 @@ var ( Query: QueryLabel, }, RawMetricName: "lora_requests_info", - RawMetricNameMapping: map[string]string{ + EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:waiting_lora_adapters", }, Description: "Count of waiting Lora Adapters", diff --git a/pkg/metrics/types.go b/pkg/metrics/types.go index 365f46787..8b10706c4 100644 --- a/pkg/metrics/types.go +++ b/pkg/metrics/types.go @@ -77,13 +77,13 @@ const ( // Metric defines a unique metric with metadata. type Metric struct { - MetricSource MetricSource - MetricType MetricType - PromQL string // Optional: Only applicable for PromQL-based metrics - RawMetricName string // Optional: Only applicable for QueryLabel-based metrics - RawMetricNameMapping map[string]string // Optional: Mapping from engine type to raw metric name. - Description string - MetricScope MetricScope + MetricSource MetricSource + MetricType MetricType + PromQL string // Optional: Only applicable for PromQL-based metrics + RawMetricName string // Optional: Only applicable for QueryLabel-based metrics + EngineMetricsNameMapping map[string]string // Optional: Mapping from engine type to raw metric name. + Description string + MetricScope MetricScope } // MetricValue is the interface for all metric values. From 4b58504f599020dc8e5fce6a10f07bfe30f141eb Mon Sep 17 00:00:00 2001 From: Le Xu Date: Fri, 25 Jul 2025 11:55:07 -0700 Subject: [PATCH 16/19] adding license Signed-off-by: Le Xu --- pkg/cache/utils.go | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pkg/cache/utils.go b/pkg/cache/utils.go index 3a232d2a9..d2df3e861 100644 --- a/pkg/cache/utils.go +++ b/pkg/cache/utils.go @@ -1,3 +1,19 @@ +/* +Copyright 2024 The Aibrix Team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package cache import ( From 16ad7ee954432687cdaba0c2c968ec199c22c829 Mon Sep 17 00:00:00 2001 From: Le Xu Date: Fri, 25 Jul 2025 12:01:30 -0700 Subject: [PATCH 17/19] update comments Signed-off-by: Le Xu --- pkg/metrics/metrics.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 3e9726a9a..4a938d4f5 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -279,7 +279,7 @@ var ( }, EngineMetricsNameMapping: map[string]string{ "vllm": "vllm:gpu_cache_usage_perc", - "sglang": "sglang:token_usage", + "sglang": "sglang:token_usage", // Based on https://github.com/sgl-project/sglang/issues/5979 "xllm": "kv_cache_utilization", }, Description: "GPU cache usage percentage", From 61fd8fbf2d8a73fe829e04cbf6463d347f189b15 Mon Sep 17 00:00:00 2001 From: Le Xu Date: Fri, 25 Jul 2025 14:58:05 -0700 Subject: [PATCH 18/19] splitting policy branch Signed-off-by: Le Xu --- .../gateway/algorithms/least_gpu_cache.go | 95 ------------------- pkg/plugins/gateway/algorithms/least_util.go | 84 ---------------- 2 files changed, 179 deletions(-) delete mode 100644 pkg/plugins/gateway/algorithms/least_gpu_cache.go delete mode 100644 pkg/plugins/gateway/algorithms/least_util.go diff --git a/pkg/plugins/gateway/algorithms/least_gpu_cache.go b/pkg/plugins/gateway/algorithms/least_gpu_cache.go deleted file mode 100644 index 76beafd31..000000000 --- a/pkg/plugins/gateway/algorithms/least_gpu_cache.go +++ /dev/null @@ -1,95 +0,0 @@ -/* -Copyright 2024 The Aibrix Team. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package routingalgorithms - -import ( - "fmt" - "math" - "math/rand" - - "github.com/vllm-project/aibrix/pkg/cache" - metrics "github.com/vllm-project/aibrix/pkg/metrics" - "github.com/vllm-project/aibrix/pkg/types" - v1 "k8s.io/api/core/v1" - klog "k8s.io/klog/v2" -) - -const RouterLeastGpuCache types.RoutingAlgorithm = "least-gpu-cache" - -func init() { - Register(RouterLeastGpuCache, NewLeastGpuCacheRouter) -} - -type leastGpuCacheRouter struct { - cache cache.Cache -} - -func NewLeastGpuCacheRouter() (types.Router, error) { - c, err := cache.Get() - if err != nil { - return nil, err - } - - return leastGpuCacheRouter{ - cache: c, - }, nil -} - -func (r leastGpuCacheRouter) Route(ctx *types.RoutingContext, readyPodList types.PodList) (string, error) { - var targetPod *v1.Pod - minGpuCache := math.MaxFloat64 - - for _, pod := range readyPodList.All() { - // Due to metric refactor (pull/543) to better support lora and multi models, - // we change to use PodModelMetrics instead of PodMetrics in some scenarios. - // This works but doesn't look very promising, we can revisit this part later. - gpuCache, err := r.cache.GetMetricValueByPodModel(pod.Name, pod.Namespace, ctx.Model, metrics.GPUCacheUsagePerc) - if err != nil { - klog.Error(err) - continue - } - totalCache := gpuCache.GetSimpleValue() - - klog.V(4).Infof("pod: %v, podIP: %v, gpuCache: %v", - pod.Name, pod.Status.PodIP, gpuCache.GetSimpleValue()) - - if totalCache <= minGpuCache { - minGpuCache = totalCache - targetPod = pod - } - } - - // Use fallback if no valid metrics - if targetPod == nil { - var err error - targetPod, err = SelectRandomPodAsFallback(ctx, readyPodList.All(), rand.Intn) - if err != nil { - return "", err - } - klog.V(4).Infof("select random targetPod: %s(%s)", targetPod.Name, targetPod.Status.PodIP) - } else { - klog.V(4).Infof("select targetPod: %s(%s) gpuCache: %v", targetPod.Name, targetPod.Status.PodIP, minGpuCache) - } - - if targetPod == nil { - return "", fmt.Errorf("no pods to forward request") - } - - klog.V(4).Infof("targetPod: %s(%s)", targetPod.Name, targetPod.Status.PodIP) - ctx.SetTargetPod(targetPod) - return ctx.TargetAddress(), nil -} diff --git a/pkg/plugins/gateway/algorithms/least_util.go b/pkg/plugins/gateway/algorithms/least_util.go deleted file mode 100644 index af5d35bf5..000000000 --- a/pkg/plugins/gateway/algorithms/least_util.go +++ /dev/null @@ -1,84 +0,0 @@ -/* -Copyright 2024 The Aibrix Team. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package routingalgorithms - -import ( - "math" - "math/rand" - - "github.com/vllm-project/aibrix/pkg/cache" - "github.com/vllm-project/aibrix/pkg/metrics" - "github.com/vllm-project/aibrix/pkg/types" - v1 "k8s.io/api/core/v1" - klog "k8s.io/klog/v2" -) - -const RouterUtil types.RoutingAlgorithm = "least-utilization" - -func init() { - Register(RouterUtil, NewLeastUtilRouter) -} - -type leastUtilRouter struct { - cache cache.Cache -} - -func NewLeastUtilRouter() (types.Router, error) { - c, err := cache.Get() - if err != nil { - return nil, err - } - - return leastUtilRouter{ - cache: c, - }, nil -} - -func (r leastUtilRouter) Route(ctx *types.RoutingContext, readyPodList types.PodList) (string, error) { - var targetPod *v1.Pod - minUtilization := math.MaxFloat64 // <= 1 in general - - for _, pod := range readyPodList.All() { - utilization, err := r.cache.GetMetricValueByPodModel(pod.Name, pod.Namespace, ctx.Model, metrics.EngineUtilization) - if err != nil { - klog.Error(err) - continue - } - utilizationValue := utilization.GetSimpleValue() - klog.V(4).Infof("pod: %v, podIP: %v, engine utilization: %v", pod.Name, pod.Status.PodIP, utilizationValue) - - if utilizationValue < minUtilization { - minUtilization = utilizationValue - targetPod = pod - } - } - - // Use fallback if no valid metrics - if targetPod == nil { - var err error - targetPod, err = SelectRandomPodAsFallback(ctx, readyPodList.All(), rand.Intn) - if err != nil { - return "", err - } - klog.V(4).Infof("select random pod: %v, podIP: %v", targetPod.Name, targetPod.Status.PodIP) - } else { - klog.V(4).Infof("select target pod: %v, podIP: %v, engine utilization: %v", targetPod.Name, targetPod.Status.PodIP, minUtilization) - } - - ctx.SetTargetPod(targetPod) - return ctx.TargetAddress(), nil -} From 14fb40045398fe12b31da06a01e5c06760e85657 Mon Sep 17 00:00:00 2001 From: Le Xu Date: Fri, 25 Jul 2025 15:32:33 -0700 Subject: [PATCH 19/19] addressing comments Signed-off-by: Le Xu --- pkg/cache/cache_metrics.go | 12 ++++++++---- pkg/cache/utils.go | 10 ++++++---- pkg/metrics/metrics.go | 3 ++- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/pkg/cache/cache_metrics.go b/pkg/cache/cache_metrics.go index f9f38758d..3aa1400e2 100644 --- a/pkg/cache/cache_metrics.go +++ b/pkg/cache/cache_metrics.go @@ -35,7 +35,6 @@ const ( modelLabel = "model.aibrix.ai/name" defaultMetricPort = 8000 defaultEngineLabelValue = "vllm" - defaultModelName = "" defaultPodMetricRefreshIntervalInMS = 50 defaultPodMetricsWorkerCount = 10 ) @@ -338,7 +337,11 @@ func (c *Store) fetchMetrics(pod *Pod, allMetrics map[string]*dto.MetricFamily, klog.V(4).Infof("Cannot find labelMetricName %v in collected metrics names", labelMetricName) return nil, false } - engineType := getPodLabel(pod, engineLabel, defaultEngineLabelValue) + engineType, err := getPodLabel(pod, engineLabel) + if engineType == "" { + klog.V(4).Infof(err.Error()) + engineType = defaultEngineLabelValue + } rawMetricName, ok := metric.EngineMetricsNameMapping[engineType] if !ok { klog.V(4).Infof("Cannot find engine type %v mapping for metrics %v", engineType, labelMetricName) @@ -358,9 +361,10 @@ func (c *Store) updatePodRecord(pod *Pod, modelName string, metricName string, s if scope == metrics.PodMetricScope { pod.Metrics.Store(metricName, metricValue) } else if scope == metrics.PodModelMetricScope { + var err error if modelName == "" { - modelName = getPodLabel(pod, modelLabel, defaultModelName) - if modelName == "" { + modelName, err = getPodLabel(pod, modelLabel) + if err != nil { return fmt.Errorf("modelName should not be empty for scope %v", scope) } } diff --git a/pkg/cache/utils.go b/pkg/cache/utils.go index d2df3e861..307093da3 100644 --- a/pkg/cache/utils.go +++ b/pkg/cache/utils.go @@ -1,5 +1,5 @@ /* -Copyright 2024 The Aibrix Team. +Copyright 2025 The Aibrix Team. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ limitations under the License. package cache import ( + "fmt" "strconv" "k8s.io/klog/v2" @@ -36,11 +37,12 @@ func getPodMetricPort(pod *Pod) int { return defaultMetricPort } -func getPodLabel(pod *Pod, labelName string, defaultValue string) string { +func getPodLabel(pod *Pod, labelName string) (string, error) { labelTarget, ok := pod.Labels[labelName] if !ok { klog.V(4).Infof("No label %v name for pod %v, default to %v", labelName, pod.Name, defaultEngineLabelValue) - return defaultValue + err := fmt.Errorf("error executing query: no label %v found for pod %v", labelName, pod.Name) + return "", err } - return labelTarget + return labelTarget, nil } diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 4a938d4f5..8350dd9c5 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -111,7 +111,8 @@ var ( Raw: Gauge, }, EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:avg_generation_throughput_toks_per_s", + "vllm": "vllm:avg_generation_throughput_toks_per_s", + "sglang": "sglang:gen_throughput", }, Description: "Average generation throughput in tokens per second", },