kubernetes-sigs
diff --git a/cmd/epp/runner/runner.go b/cmd/epp/runner/runner.go
Lines changed: 19 additions & 7 deletions
diff --git a/pkg/epp/backend/metrics/fake.go b/pkg/epp/backend/metrics/fake.go
Lines changed: 6 additions & 0 deletions
diff --git a/pkg/epp/backend/metrics/logger.go b/pkg/epp/backend/metrics/logger.go
Lines changed: 10 additions & 11 deletions
diff --git a/pkg/epp/backend/metrics/pod_metrics.go b/pkg/epp/backend/metrics/pod_metrics.go
Lines changed: 10 additions & 5 deletions
diff --git a/pkg/epp/backend/metrics/types.go b/pkg/epp/backend/metrics/types.go
Lines changed: 15 additions & 9 deletions
diff --git a/pkg/epp/common/config/defaults.go b/pkg/epp/common/config/defaults.go
Lines changed: 12 additions & 3 deletions
diff --git a/pkg/epp/datastore/datastore.go b/pkg/epp/datastore/datastore.go
Lines changed: 4 additions & 8 deletions
diff --git a/pkg/epp/datastore/datastore_test.go b/pkg/epp/datastore/datastore_test.go
Lines changed: 19 additions & 12 deletions
diff --git a/pkg/epp/metrics/collectors/inference_pool.go b/pkg/epp/metrics/collectors/inference_pool.go
Lines changed: 2 additions & 1 deletion
diff --git a/pkg/epp/requestcontrol/director_test.go b/pkg/epp/requestcontrol/director_test.go
Lines changed: 1 addition & 1 deletion
@@ -42,6 +42,7 @@ import (
 
 	"sigs.k8s.io/gateway-api-inference-extension/internal/runnable"
 	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/common/config"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/common/config/loader"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
@@ -64,7 +65,7 @@ import (
 var (
 	grpcPort = flag.Int(
 		"grpc-port",
-		runserver.DefaultGrpcPort,
+		config.DefaultGrpcPort,
 		"The gRPC port used for communicating with Envoy proxy")
 	grpcHealthPort = flag.Int(
 		"grpc-health-port",
@@ -80,20 +81,20 @@ var (
 		"Enables pprof handlers. Defaults to true. Set to false to disable pprof handlers.")
 	destinationEndpointHintKey = flag.String(
 		"destination-endpoint-hint-key",
-		runserver.DefaultDestinationEndpointHintKey,
+		config.DefaultDestinationEndpointHintKey,
 		"Header and response metadata key used by Envoy to route to the appropriate pod. This must match Envoy configuration.")
 	destinationEndpointHintMetadataNamespace = flag.String(
 		"destination-endpoint-hint-metadata-namespace",
-		runserver.DefaultDestinationEndpointHintMetadataNamespace,
+		config.DefaultDestinationEndpointHintMetadataNamespace,
 		"The key for the outer namespace struct in the metadata field of the extproc response that is used to wrap the"+
 			"target endpoint. If not set, then an outer namespace struct should not be created.")
 	poolName = flag.String(
 		"pool-name",
-		runserver.DefaultPoolName,
+		config.DefaultPoolName,
 		"Name of the InferencePool this Endpoint Picker is associated with.")
 	poolNamespace = flag.String(
 		"pool-namespace",
-		runserver.DefaultPoolNamespace,
+		config.DefaultPoolNamespace,
 		"Namespace of the InferencePool this Endpoint Picker is associated with.")
 	refreshMetricsInterval = flag.Duration(
 		"refresh-metrics-interval",
@@ -109,11 +110,11 @@ var (
 		"number for the log level verbosity")
 	secureServing = flag.Bool(
 		"secure-serving",
-		runserver.DefaultSecureServing,
+		config.DefaultSecureServing,
 		"Enables secure serving. Defaults to true.")
 	healthChecking = flag.Bool(
 		"health-checking",
-		runserver.DefaultHealthChecking,
+		config.DefaultHealthChecking,
 		"Enables health checking")
 	certPath = flag.String(
 		"cert-path",
@@ -135,6 +136,16 @@ var (
 		"lora-info-metric",
 		runserver.DefaultLoraInfoMetric,
 		"Prometheus metric for the LoRA info metrics (must be in vLLM label format).")
+
+	// metrics related flags
+	refreshMetricsInterval = flag.Duration(
+		"refreshMetricsInterval",
+		config.DefaultRefreshMetricsInterval,
+		"interval to refresh metrics")
+	refreshPrometheusMetricsInterval = flag.Duration(
+		"refreshPrometheusMetricsInterval",
+		config.DefaultRefreshPrometheusMetricsInterval,
+		"interval to flush prometheus metrics")
 	metricsStalenessThreshold = flag.Duration("metricsStalenessThreshold",
 		config.DefaultMetricsStalenessThreshold,
 		"Duration after which metrics are considered stale. This is used to determine if a pod's metrics "+
@@ -339,6 +350,7 @@ func (r *Runner) Run(ctx context.Context) error {
 		HealthChecking:                           *healthChecking,
 		CertPath:                                 *certPath,
 		RefreshPrometheusMetricsInterval:         *refreshPrometheusMetricsInterval,
+		MetricsStalenessThreshold:                *metricsStalenessThreshold,
 		Director:                                 director,
 		SaturationDetector:                       saturationDetector,
 	}
 
@@ -20,12 +20,14 @@ import (
 	"context"
 	"fmt"
 	"sync"
+	"time"
 
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/types"
 	"sigs.k8s.io/controller-runtime/pkg/log"
 
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/common/config"
 	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
 )
 
@@ -50,6 +52,10 @@ func (fpm *FakePodMetrics) UpdatePod(pod *corev1.Pod) {
 }
 func (fpm *FakePodMetrics) StopRefreshLoop() {} // noop
 
+func (fpm *FakePodMetrics) GetMetricsStalenessThreshold() time.Duration {
+	return config.DefaultMetricsStalenessThreshold
+}
+
 type FakePodMetricsClient struct {
 	errMu sync.RWMutex
 	Err   map[types.NamespacedName]error
 
@@ -30,23 +30,20 @@ import (
 )
 
 const (
-	// Note currently the EPP treats stale metrics same as fresh.
-	// TODO: https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/336
-	metricsValidityPeriod = 5 * time.Second
-	debugPrintInterval    = 5 * time.Second
+	debugPrintInterval = 5 * time.Second
 )
 
 type Datastore interface {
 	PoolGet() (*v1.InferencePool, error)
 	// PodMetrics operations
-	// PodGetAll returns all pods and metrics, including fresh and stale.
+	// PodGetAll returns all pods and metrics.
 	PodGetAll() []PodMetrics
 	PodList(func(PodMetrics) bool) []PodMetrics
 }
 
 // StartMetricsLogger starts goroutines to 1) Print metrics debug logs if the DEBUG log level is
 // enabled; 2) flushes Prometheus metrics about the backend servers.
-func StartMetricsLogger(ctx context.Context, datastore Datastore, refreshPrometheusMetricsInterval time.Duration) {
+func StartMetricsLogger(ctx context.Context, datastore Datastore, refreshPrometheusMetricsInterval, metricsStalenessThreshold time.Duration) {
 	logger := log.FromContext(ctx)
 	ticker := time.NewTicker(refreshPrometheusMetricsInterval)
 	go func() {
@@ -57,7 +54,7 @@ func StartMetricsLogger(ctx context.Context, datastore Datastore, refreshPrometh
 				logger.V(logutil.DEFAULT).Info("Shutting down prometheus metrics thread")
 				return
 			case <-ticker.C: // Periodically refresh prometheus metrics for inference pool
-				refreshPrometheusMetrics(logger, datastore)
+				refreshPrometheusMetrics(logger, datastore, metricsStalenessThreshold)
 			}
 		}
 	}()
@@ -74,10 +71,10 @@ func StartMetricsLogger(ctx context.Context, datastore Datastore, refreshPrometh
 					return
 				case <-ticker.C:
 					podsWithFreshMetrics := datastore.PodList(func(pm PodMetrics) bool {
-						return time.Since(pm.GetMetrics().UpdateTime) <= metricsValidityPeriod
+						return time.Since(pm.GetMetrics().UpdateTime) <= metricsStalenessThreshold
 					})
 					podsWithStaleMetrics := datastore.PodList(func(pm PodMetrics) bool {
-						return time.Since(pm.GetMetrics().UpdateTime) > metricsValidityPeriod
+						return time.Since(pm.GetMetrics().UpdateTime) > metricsStalenessThreshold
 					})
 					s := fmt.Sprintf("Current Pods and metrics gathered. Fresh metrics: %+v, Stale metrics: %+v", podsWithFreshMetrics, podsWithStaleMetrics)
 					logger.V(logutil.VERBOSE).Info(s)
@@ -87,7 +84,7 @@ func StartMetricsLogger(ctx context.Context, datastore Datastore, refreshPrometh
 	}
 }
 
-func refreshPrometheusMetrics(logger logr.Logger, datastore Datastore) {
+func refreshPrometheusMetrics(logger logr.Logger, datastore Datastore, metricsStalenessThreshold time.Duration) {
 	pool, err := datastore.PoolGet()
 	if err != nil {
 		// No inference pool or not initialize.
@@ -98,7 +95,9 @@ func refreshPrometheusMetrics(logger logr.Logger, datastore Datastore) {
 	var kvCacheTotal float64
 	var queueTotal int
 
-	podMetrics := datastore.PodGetAll()
+	podMetrics := datastore.PodList(func(pm PodMetrics) bool {
+		return time.Since(pm.GetMetrics().UpdateTime) <= metricsStalenessThreshold
+	})
 	logger.V(logutil.TRACE).Info("Refreshing Prometheus Metrics", "ReadyPods", len(podMetrics))
 	if len(podMetrics) == 0 {
 		return
 
@@ -36,11 +36,12 @@ const (
 )
 
 type podMetrics struct {
-	pod      atomic.Pointer[backend.Pod]
-	metrics  atomic.Pointer[MetricsState]
-	pmc      PodMetricsClient
-	ds       Datastore
-	interval time.Duration
+	pod                atomic.Pointer[backend.Pod]
+	metrics            atomic.Pointer[MetricsState]
+	pmc                PodMetricsClient
+	ds                 Datastore
+	interval           time.Duration
+	stalenessThreshold time.Duration
 
 	startOnce sync.Once // ensures the refresh loop goroutine is started only once
 	stopOnce  sync.Once // ensures the done channel is closed only once
@@ -69,6 +70,10 @@ func (pm *podMetrics) UpdatePod(pod *corev1.Pod) {
 	pm.pod.Store(toInternalPod(pod))
 }
 
+func (pm *podMetrics) GetMetricsStalenessThreshold() time.Duration {
+	return pm.stalenessThreshold
+}
+
 func toInternalPod(pod *corev1.Pod) *backend.Pod {
 	labels := make(map[string]string, len(pod.GetLabels()))
 	for key, value := range pod.GetLabels() {
 
@@ -32,26 +32,27 @@ func NewPodMetricsFactory(pmc PodMetricsClient, refreshMetricsInterval, metricsS
 	return &PodMetricsFactory{
 		pmc:                       pmc,
 		refreshMetricsInterval:    refreshMetricsInterval,
-		MetricsStalenessThreshold: metricsStalenessThreshold,
+		metricsStalenessThreshold: metricsStalenessThreshold,
 	}
 }
 
 type PodMetricsFactory struct {
 	pmc                       PodMetricsClient
 	refreshMetricsInterval    time.Duration
-	MetricsStalenessThreshold time.Duration
+	metricsStalenessThreshold time.Duration
 }
 
 func (f *PodMetricsFactory) NewPodMetrics(parentCtx context.Context, in *corev1.Pod, ds Datastore) PodMetrics {
 	pod := toInternalPod(in)
 	pm := &podMetrics{
-		pmc:       f.pmc,
-		ds:        ds,
-		interval:  f.refreshMetricsInterval,
-		startOnce: sync.Once{},
-		stopOnce:  sync.Once{},
-		done:      make(chan struct{}),
-		logger:    log.FromContext(parentCtx).WithValues("pod", pod.NamespacedName),
+		pmc:                f.pmc,
+		ds:                 ds,
+		interval:           f.refreshMetricsInterval,
+		stalenessThreshold: f.metricsStalenessThreshold,
+		startOnce:          sync.Once{},
+		stopOnce:           sync.Once{},
+		done:               make(chan struct{}),
+		logger:             log.FromContext(parentCtx).WithValues("pod", pod.NamespacedName),
 	}
 	pm.pod.Store(pod)
 	pm.metrics.Store(NewMetricsState())
@@ -63,7 +64,12 @@ func (f *PodMetricsFactory) NewPodMetrics(parentCtx context.Context, in *corev1.
 type PodMetrics interface {
 	GetPod() *backend.Pod
 	GetMetrics() *MetricsState
+	GetMetricsStalenessThreshold() time.Duration
 	UpdatePod(*corev1.Pod)
 	StopRefreshLoop()
 	String() string
 }
+
+func FreshMetricsFn(pm PodMetrics) bool {
+	return time.Since(pm.GetMetrics().UpdateTime) <= pm.GetMetricsStalenessThreshold()
+}
@@ -29,7 +29,16 @@ const (
 	DefaultQueueThresholdCritical = 5
 	// DefaultMetricsStalenessThreshold defines how old metrics can be before they
 	// are considered stale.
-	// Given the pod metrics refresh interval is 50ms, a threshold slightly above
-	// that should be fine.
-	DefaultMetricsStalenessThreshold = 200 * time.Millisecond
+	// The staleness is determined by the refresh interval plus the latency of the metrics API.
+	// To be on the safer side, we start with a larger threshold.
+	DefaultMetricsStalenessThreshold                = 2 * time.Second                  // default for --metricsStalenessThreshold
+	DefaultGrpcPort                                 = 9002                             // default for --grpc-port
+	DefaultDestinationEndpointHintMetadataNamespace = "envoy.lb"                       // default for --destination-endpoint-hint-metadata-namespace
+	DefaultDestinationEndpointHintKey               = "x-gateway-destination-endpoint" // default for --destination-endpoint-hint-key
+	DefaultPoolName                                 = ""                               // required but no default
+	DefaultPoolNamespace                            = "default"                        // default for --pool-namespace
+	DefaultRefreshMetricsInterval                   = 50 * time.Millisecond            // default for --refreshMetricsInterval
+	DefaultRefreshPrometheusMetricsInterval         = 5 * time.Second                  // default for --refreshPrometheusMetricsInterval
+	DefaultSecureServing                            = true                             // default for --secure-serving
+	DefaultHealthChecking                           = false                            // default for --health-checking
 )
@@ -22,7 +22,6 @@ import (
 	"fmt"
 	"reflect"
 	"sync"
-	"time"
 
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/labels"
@@ -64,7 +63,7 @@ type Datastore interface {
 	ModelGetAll() []*v1alpha2.InferenceModel
 
 	// PodMetrics operations
-	// PodGetAll returns all pods and metrics, including fresh and stale.
+	// PodGetAll returns every pod regardless of metrics freshness (stale and fresh alike); intended for testing only.
 	PodGetAll() []backendmetrics.PodMetrics
 	// PodList lists pods matching the given predicate.
 	PodList(predicate func(backendmetrics.PodMetrics) bool) []backendmetrics.PodMetrics
@@ -95,9 +94,8 @@ type datastore struct {
 	// key: InferenceModel.Spec.ModelName, value: *InferenceModel
 	models map[string]*v1alpha2.InferenceModel
 	// key: types.NamespacedName, value: backendmetrics.PodMetrics
-	pods                      *sync.Map
-	pmf                       *backendmetrics.PodMetricsFactory
-	MetricsStalenessThreshold time.Duration
+	pods *sync.Map
+	pmf  *backendmetrics.PodMetricsFactory
 }
 
 func (ds *datastore) Clear() {
@@ -249,9 +247,7 @@ func (ds *datastore) ModelGetAll() []*v1alpha2.InferenceModel {
 // /// Pods/endpoints APIs ///
 
 func (ds *datastore) PodGetAll() []backendmetrics.PodMetrics {
-	return ds.PodList(func(pm backendmetrics.PodMetrics) bool {
-		return time.Since(pm.GetMetrics().UpdateTime) <= ds.pmf.MetricsStalenessThreshold
-	})
+	return ds.PodList(func(backendmetrics.PodMetrics) bool { return true })
 }
 
 func (ds *datastore) PodList(predicate func(backendmetrics.PodMetrics) bool) []backendmetrics.PodMetrics {
 
@@ -265,6 +265,12 @@ var (
 		},
 		WaitingModels: map[string]int{},
 	}
+	pod3 = &corev1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: "pod3",
+		},
+	}
+
 	pod1NamespacedName = types.NamespacedName{Name: pod1.Name, Namespace: pod1.Namespace}
 	pod2NamespacedName = types.NamespacedName{Name: pod2.Name, Namespace: pod2.Namespace}
 	inferencePool      = &v1.InferencePool{
@@ -314,17 +320,19 @@ func TestMetrics(t *testing.T) {
 				},
 			},
 			storePods: []*corev1.Pod{pod1, pod2},
-			want: []*backendmetrics.MetricsState{
-				pod1Metrics,
-				// Failed to fetch pod2 metrics so it remains the default values.
-				{
-					ActiveModels:        map[string]int{},
-					WaitingModels:       map[string]int{},
-					WaitingQueueSize:    0,
-					KVCacheUsagePercent: 0,
-					MaxActiveModels:     0,
+			want:      []*backendmetrics.MetricsState{pod1Metrics},
+		},
+		{
+			name: "Filter stale metrics",
+			pmc: &backendmetrics.FakePodMetricsClient{
+				Res: map[types.NamespacedName]*backendmetrics.MetricsState{
+					pod1NamespacedName: pod1Metrics,
+					pod2NamespacedName: pod2Metrics,
 				},
 			},
+			storePods: []*corev1.Pod{pod1, pod2, pod3},
+			want:      []*backendmetrics.MetricsState{pod1Metrics, pod2Metrics}, // pod3 metrics were stale and should not be included.
+
 		},
 	}
 
@@ -344,10 +352,9 @@ func TestMetrics(t *testing.T) {
 			for _, pod := range test.storePods {
 				ds.PodUpdateOrAddIfNotExist(pod)
 			}
+			time.Sleep(1 * time.Second) // Give some time for the metrics to be fetched.
 			assert.EventuallyWithT(t, func(t *assert.CollectT) {
-				got := ds.PodList(func(backendmetrics.PodMetrics) bool {
-					return true
-				})
+				got := ds.PodList(backendmetrics.FreshMetricsFn)
 				metrics := []*backendmetrics.MetricsState{}
 				for _, one := range got {
 					metrics = append(metrics, one.GetMetrics())
 
@@ -20,6 +20,7 @@ import (
 	"github.com/prometheus/client_golang/prometheus"
 	compbasemetrics "k8s.io/component-base/metrics"
 
+	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
 	metricsutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/metrics"
 )
@@ -62,7 +63,7 @@ func (c *inferencePoolMetricsCollector) Collect(ch chan<- prometheus.Metric) {
 		return
 	}
 
-	podMetrics := c.ds.PodGetAll()
+	podMetrics := c.ds.PodList(backendmetrics.FreshMetricsFn)
 	if len(podMetrics) == 0 {
 		return
 	}
 
@@ -532,7 +532,7 @@ func TestGetCandidatePodsForScheduling(t *testing.T) {
 		},
 	}
 
-	pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Second)
+	pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Second, config.DefaultMetricsStalenessThreshold)
 	ds := datastore.NewDatastore(t.Context(), pmf)
 	for _, testPod := range testInput {
 		ds.PodUpdateOrAddIfNotExist(testPod)
Original file line number	Diff line number	Diff line change
`@@ -20,6 +20,7 @@ import (`
`20`	`20`	`"github.com/prometheus/client_golang/prometheus"`
`21`	`21`	`compbasemetrics "k8s.io/component-base/metrics"`
`22`	`22`
	`23`	`+ backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"`
`23`	`24`	`"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"`
`24`	`25`	`metricsutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/metrics"`
`25`	`26`	`)`
`@@ -62,7 +63,7 @@ func (c *inferencePoolMetricsCollector) Collect(ch chan<- prometheus.Metric) {`
`62`	`63`	`return`
`63`	`64`	`}`
`64`	`65`
`65`		`- podMetrics := c.ds.PodGetAll()`
	`66`	`+ podMetrics := c.ds.PodList(backendmetrics.FreshMetricsFn)`
`66`	`67`	`if len(podMetrics) == 0 {`
`67`	`68`	`return`
`68`	`69`	`}`
Original file line number	Diff line number	Diff line change
`@@ -532,7 +532,7 @@ func TestGetCandidatePodsForScheduling(t *testing.T) {`
`532`	`532`	`},`
`533`	`533`	`}`
`534`	`534`
`535`		`- pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Second)`
	`535`	`+ pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Second, config.DefaultMetricsStalenessThreshold)`
`536`	`536`	`ds := datastore.NewDatastore(t.Context(), pmf)`
`537`	`537`	`for _, testPod := range testInput {`
`538`	`538`	`ds.PodUpdateOrAddIfNotExist(testPod)`