Skip to content

Commit 81ace48

Browse files
[Feature] Adding raw metrics name conversion in metrics (#1293)
Signed-off-by: Le Xu <[email protected]> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
1 parent 7fe0a91 commit 81ace48

File tree

4 files changed

+170
-33
lines changed

4 files changed

+170
-33
lines changed

pkg/cache/cache_metrics.go

Lines changed: 39 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ package cache
1717
import (
1818
"context"
1919
"fmt"
20-
"strconv"
2120
"time"
2221

2322
prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1"
@@ -28,11 +27,16 @@ import (
2827
)
2928

3029
const (
31-
defaultMetricPort = 8000
3230
// When the engine's HTTP proxy is separated from the engine itself,
3331
// the request port and metrics port may differ, so a dedicated metrics port is required.
3432
MetricPortLabel = "model.aibrix.ai/metric-port"
33+
engineLabel = "model.aibrix.ai/engine"
34+
portLabel = "model.aibrix.ai/port"
35+
modelLabel = "model.aibrix.ai/name"
36+
defaultMetricPort = 8000
37+
defaultEngineLabelValue = "vllm"
3538
defaultPodMetricRefreshIntervalInMS = 50
39+
defaultPodMetricsWorkerCount = 10
3640
)
3741

3842
var (
@@ -44,6 +48,7 @@ var (
4448
metrics.AvgGenerationThroughputToksPerS,
4549
metrics.GPUCacheUsagePerc,
4650
metrics.CPUCacheUsagePerc,
51+
metrics.EngineUtilization,
4752
}
4853

4954
// histogram metric example - time_to_first_token_seconds, _sum, _bucket _count.
@@ -117,8 +122,6 @@ func (c *Store) getPodModelMetricName(modelName string, metricName string) strin
117122
return fmt.Sprintf("%s/%s", modelName, metricName)
118123
}
119124

120-
const defaultPodMetricsWorkerCount = 10
121-
122125
func (c *Store) updatePodMetrics() {
123126
c.metaPods.Range(func(key string, metaPod *Pod) bool {
124127
if !utils.FilterReadyPod(metaPod.Pod) {
@@ -173,7 +176,7 @@ func (c *Store) updateSimpleMetricFromRawMetrics(pod *Pod, allMetrics map[string
173176
}
174177

175178
// TODO: we should refactor metricName to fit other engines
176-
metricFamily, exists := allMetrics[fmt.Sprintf("vllm:%s", metricName)]
179+
metricFamily, exists := c.fetchMetrics(pod, allMetrics, metricName)
177180
if !exists {
178181
klog.V(4).Infof("Cannot find %v in the pod metrics", metricName)
179182
continue
@@ -208,8 +211,7 @@ func (c *Store) updateHistogramMetricFromRawMetrics(pod *Pod, allMetrics map[str
208211
klog.V(4).Infof("Cannot find %v in the metric list", metricName)
209212
continue
210213
}
211-
212-
metricFamily, exists := allMetrics[fmt.Sprintf("vllm:%s", metricName)]
214+
metricFamily, exists := c.fetchMetrics(pod, allMetrics, metricName)
213215
if !exists {
214216
klog.V(4).Infof("Cannot find %v in the pod metrics", metricName)
215217
continue
@@ -235,7 +237,6 @@ func (c *Store) updateHistogramMetricFromRawMetrics(pod *Pod, allMetrics map[str
235237
}
236238

237239
klog.V(5).InfoS("Successfully parsed metrics", "metric", metricName, "model", modelName, "PodIP", pod.Status.PodIP, "Port", podMetricPort, "metricValue", metricValue)
238-
239240
}
240241
}
241242
}
@@ -250,7 +251,7 @@ func (c *Store) updateQueryLabelMetricFromRawMetrics(pod *Pod, allMetrics map[st
250251
}
251252
rawMetricName := metric.RawMetricName
252253
scope := metric.MetricScope
253-
metricFamily, exists := allMetrics[fmt.Sprintf("vllm:%s", rawMetricName)]
254+
metricFamily, exists := c.fetchMetrics(pod, allMetrics, rawMetricName)
254255
if !exists {
255256
klog.V(4).Infof("Cannot find %v in the pod metrics", rawMetricName)
256257
continue
@@ -330,14 +331,42 @@ func (c *Store) queryUpdatePromQLMetrics(ctx context.Context, metric metrics.Met
330331
return nil
331332
}
332333

334+
func (c *Store) fetchMetrics(pod *Pod, allMetrics map[string]*dto.MetricFamily, labelMetricName string) (*dto.MetricFamily, bool) {
335+
metric, exists := metrics.Metrics[labelMetricName]
336+
if !exists {
337+
klog.V(4).Infof("Cannot find labelMetricName %v in collected metrics names", labelMetricName)
338+
return nil, false
339+
}
340+
engineType, err := getPodLabel(pod, engineLabel)
341+
if engineType == "" {
342+
klog.V(4).Infof(err.Error())
343+
engineType = defaultEngineLabelValue
344+
}
345+
rawMetricName, ok := metric.EngineMetricsNameMapping[engineType]
346+
if !ok {
347+
klog.V(4).Infof("Cannot find engine type %v mapping for metrics %v", engineType, labelMetricName)
348+
return nil, false
349+
}
350+
metricFamily, exists := allMetrics[rawMetricName]
351+
if !exists {
352+
klog.V(4).Infof("Cannot find raw metrics %v, engine type %v", rawMetricName, engineType)
353+
return nil, false
354+
}
355+
return metricFamily, true
356+
}
357+
333358
// Update `PodMetrics` and `PodModelMetrics` according to the metric scope
334359
// TODO: replace the in-place updates of podMetrics and podModelMetrics with a fresh copy to prevent stale metric keys
335360
func (c *Store) updatePodRecord(pod *Pod, modelName string, metricName string, scope metrics.MetricScope, metricValue metrics.MetricValue) error {
336361
if scope == metrics.PodMetricScope {
337362
pod.Metrics.Store(metricName, metricValue)
338363
} else if scope == metrics.PodModelMetricScope {
364+
var err error
339365
if modelName == "" {
340-
return fmt.Errorf("modelName should not be empty for scope %v", scope)
366+
modelName, err = getPodLabel(pod, modelLabel)
367+
if err != nil {
368+
return fmt.Errorf("modelName should not be empty for scope %v", scope)
369+
}
341370
}
342371
pod.ModelMetrics.Store(c.getPodModelMetricName(modelName, metricName), metricValue)
343372
} else {
@@ -366,17 +395,3 @@ func (c *Store) aggregateMetrics() {
366395
}
367396
}
368397
}
369-
370-
func getPodMetricPort(pod *Pod) int {
371-
if pod == nil || pod.Labels == nil {
372-
return defaultMetricPort
373-
}
374-
if v, ok := pod.Labels[MetricPortLabel]; ok && v != "" {
375-
if p, err := strconv.Atoi(v); err == nil {
376-
return p
377-
} else {
378-
klog.Warningf("Invalid value for label %s on pod %s/%s: %q. Using default port %d.", MetricPortLabel, pod.Namespace, pod.Name, v, defaultMetricPort)
379-
}
380-
}
381-
return defaultMetricPort
382-
}

pkg/cache/utils.go

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
/*
2+
Copyright 2025 The Aibrix Team.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package cache
18+
19+
import (
20+
"fmt"
21+
"strconv"
22+
23+
"k8s.io/klog/v2"
24+
)
25+
26+
func getPodMetricPort(pod *Pod) int {
27+
if pod == nil || pod.Labels == nil {
28+
return defaultMetricPort
29+
}
30+
if v, ok := pod.Labels[MetricPortLabel]; ok && v != "" {
31+
if p, err := strconv.Atoi(v); err == nil {
32+
return p
33+
} else {
34+
klog.Warningf("Invalid value for label %s on pod %s/%s: %q. Using default port %d.", MetricPortLabel, pod.Namespace, pod.Name, v, defaultMetricPort)
35+
}
36+
}
37+
return defaultMetricPort
38+
}
39+
40+
func getPodLabel(pod *Pod, labelName string) (string, error) {
41+
labelTarget, ok := pod.Labels[labelName]
42+
if !ok {
43+
klog.V(4).Infof("No label %v name for pod %v, default to %v", labelName, pod.Name, defaultEngineLabelValue)
44+
err := fmt.Errorf("error executing query: no label %v found for pod %v", labelName, pod.Name)
45+
return "", err
46+
}
47+
return labelTarget, nil
48+
}

0 commit comments

Comments
 (0)