kubernetes-sigs
diff --git a/cmd/epp/runner/runner.go b/cmd/epp/runner/runner.go
Lines changed: 19 additions & 15 deletions
diff --git a/pkg/epp/backend/metrics/fake.go b/pkg/epp/backend/metrics/fake.go
Lines changed: 6 additions & 0 deletions
diff --git a/pkg/epp/backend/metrics/logger.go b/pkg/epp/backend/metrics/logger.go
Lines changed: 10 additions & 11 deletions
diff --git a/pkg/epp/backend/metrics/pod_metrics.go b/pkg/epp/backend/metrics/pod_metrics.go
Lines changed: 10 additions & 5 deletions
diff --git a/pkg/epp/backend/metrics/types.go b/pkg/epp/backend/metrics/types.go
Lines changed: 15 additions & 9 deletions
diff --git a/pkg/epp/common/config/defaults.go b/pkg/epp/common/config/defaults.go
Lines changed: 12 additions & 3 deletions
diff --git a/pkg/epp/datastore/datastore.go b/pkg/epp/datastore/datastore.go
Lines changed: 4 additions & 8 deletions
diff --git a/pkg/epp/datastore/datastore_test.go b/pkg/epp/datastore/datastore_test.go
Lines changed: 20 additions & 12 deletions
diff --git a/pkg/epp/metrics/collectors/inference_pool.go b/pkg/epp/metrics/collectors/inference_pool.go
Lines changed: 2 additions & 1 deletion
diff --git a/pkg/epp/requestcontrol/director_test.go b/pkg/epp/requestcontrol/director_test.go
Lines changed: 1 addition & 1 deletion
@@ -39,6 +39,7 @@ import (
 	conformance_epp "sigs.k8s.io/gateway-api-inference-extension/conformance/testing-epp"
 	"sigs.k8s.io/gateway-api-inference-extension/internal/runnable"
 	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/common/config"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/common/config/loader"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
@@ -59,7 +60,7 @@ import (
 var (
 	grpcPort = flag.Int(
 		"grpcPort",
-		runserver.DefaultGrpcPort,
+		config.DefaultGrpcPort,
 		"The gRPC port used for communicating with Envoy proxy")
 	grpcHealthPort = flag.Int(
 		"grpcHealthPort",
@@ -69,33 +70,26 @@ var (
 		"metricsPort", 9090, "The metrics port")
 	destinationEndpointHintKey = flag.String(
 		"destinationEndpointHintKey",
-		runserver.DefaultDestinationEndpointHintKey,
+		config.DefaultDestinationEndpointHintKey,
 		"Header and response metadata key used by Envoy to route to the appropriate pod. This must match Envoy configuration.")
 	destinationEndpointHintMetadataNamespace = flag.String(
 		"DestinationEndpointHintMetadataNamespace",
-		runserver.DefaultDestinationEndpointHintMetadataNamespace,
+		config.DefaultDestinationEndpointHintMetadataNamespace,
 		"The key for the outer namespace struct in the metadata field of the extproc response that is used to wrap the"+
 			"target endpoint. If not set, then an outer namespace struct should not be created.")
 	poolName = flag.String(
 		"poolName",
-		runserver.DefaultPoolName,
+		config.DefaultPoolName,
 		"Name of the InferencePool this Endpoint Picker is associated with.")
 	poolNamespace = flag.String(
 		"poolNamespace",
-		runserver.DefaultPoolNamespace,
+		config.DefaultPoolNamespace,
 		"Namespace of the InferencePool this Endpoint Picker is associated with.")
-	refreshMetricsInterval = flag.Duration(
-		"refreshMetricsInterval",
-		runserver.DefaultRefreshMetricsInterval,
-		"interval to refresh metrics")
-	refreshPrometheusMetricsInterval = flag.Duration(
-		"refreshPrometheusMetricsInterval",
-		runserver.DefaultRefreshPrometheusMetricsInterval,
-		"interval to flush prometheus metrics")
+
 	logVerbosity  = flag.Int("v", logging.DEFAULT, "number for the log level verbosity")
 	secureServing = flag.Bool(
-		"secureServing", runserver.DefaultSecureServing, "Enables secure serving. Defaults to true.")
-	healthChecking = flag.Bool("healthChecking", runserver.DefaultHealthChecking, "Enables health checking")
+		"secureServing", config.DefaultSecureServing, "Enables secure serving. Defaults to true.")
+	healthChecking = flag.Bool("healthChecking", config.DefaultHealthChecking, "Enables health checking")
 	certPath       = flag.String(
 		"certPath", "", "The path to the certificate for secure serving. The certificate and private key files "+
 			"are assumed to be named tls.crt and tls.key, respectively. If not set, and secureServing is enabled, "+
@@ -111,6 +105,16 @@ var (
 	loraInfoMetric = flag.String("loraInfoMetric",
 		"vllm:lora_requests_info",
 		"Prometheus metric for the LoRA info metrics (must be in vLLM label format).")
+
+	// metrics related flags
+	refreshMetricsInterval = flag.Duration(
+		"refreshMetricsInterval",
+		config.DefaultRefreshMetricsInterval,
+		"interval to refresh metrics")
+	refreshPrometheusMetricsInterval = flag.Duration(
+		"refreshPrometheusMetricsInterval",
+		config.DefaultRefreshPrometheusMetricsInterval,
+		"interval to flush prometheus metrics")
 	metricsStalenessThreshold = flag.Duration("metricsStalenessThreshold",
 		config.DefaultMetricsStalenessThreshold,
 		"Duration after which metrics are considered stale. This is used to determine if a pod's metrics "+
 
@@ -20,11 +20,13 @@ import (
 	"context"
 	"fmt"
 	"sync"
+	"time"
 
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/types"
 	"sigs.k8s.io/controller-runtime/pkg/log"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/common/config"
 	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
 )
 
@@ -49,6 +51,10 @@ func (fpm *FakePodMetrics) UpdatePod(pod *corev1.Pod) {
 }
 func (fpm *FakePodMetrics) StopRefreshLoop() {} // noop
 
+func (fpm *FakePodMetrics) GetMetricsStalenessThreshold() time.Duration {
+	return config.DefaultMetricsStalenessThreshold
+}
+
 type FakePodMetricsClient struct {
 	errMu sync.RWMutex
 	Err   map[types.NamespacedName]error
 
@@ -29,23 +29,20 @@ import (
 )
 
 const (
-	// Note currently the EPP treats stale metrics same as fresh.
-	// TODO: https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/336
-	metricsValidityPeriod = 5 * time.Second
-	debugPrintInterval    = 5 * time.Second
+	debugPrintInterval = 5 * time.Second
 )
 
 type Datastore interface {
 	PoolGet() (*v1alpha2.InferencePool, error)
 	// PodMetrics operations
-	// PodGetAll returns all pods and metrics, including fresh and stale.
+	// PodGetAll returns all pods and metrics.
 	PodGetAll() []PodMetrics
 	PodList(func(PodMetrics) bool) []PodMetrics
 }
 
 // StartMetricsLogger starts goroutines to 1) Print metrics debug logs if the DEBUG log level is
 // enabled; 2) flushes Prometheus metrics about the backend servers.
-func StartMetricsLogger(ctx context.Context, datastore Datastore, refreshPrometheusMetricsInterval time.Duration) {
+func StartMetricsLogger(ctx context.Context, datastore Datastore, refreshPrometheusMetricsInterval, metricsStalenessThreshold time.Duration) {
 	logger := log.FromContext(ctx)
 	ticker := time.NewTicker(refreshPrometheusMetricsInterval)
 	go func() {
@@ -56,7 +53,7 @@ func StartMetricsLogger(ctx context.Context, datastore Datastore, refreshPrometh
 				logger.V(logutil.DEFAULT).Info("Shutting down prometheus metrics thread")
 				return
 			case <-ticker.C: // Periodically refresh prometheus metrics for inference pool
-				refreshPrometheusMetrics(logger, datastore)
+				refreshPrometheusMetrics(logger, datastore, metricsStalenessThreshold)
 			}
 		}
 	}()
@@ -73,10 +70,10 @@ func StartMetricsLogger(ctx context.Context, datastore Datastore, refreshPrometh
 					return
 				case <-ticker.C:
 					podsWithFreshMetrics := datastore.PodList(func(pm PodMetrics) bool {
-						return time.Since(pm.GetMetrics().UpdateTime) <= metricsValidityPeriod
+						return time.Since(pm.GetMetrics().UpdateTime) <= metricsStalenessThreshold
 					})
 					podsWithStaleMetrics := datastore.PodList(func(pm PodMetrics) bool {
-						return time.Since(pm.GetMetrics().UpdateTime) > metricsValidityPeriod
+						return time.Since(pm.GetMetrics().UpdateTime) > metricsStalenessThreshold
 					})
 					s := fmt.Sprintf("Current Pods and metrics gathered. Fresh metrics: %+v, Stale metrics: %+v", podsWithFreshMetrics, podsWithStaleMetrics)
 					logger.V(logutil.VERBOSE).Info(s)
@@ -86,7 +83,7 @@ func StartMetricsLogger(ctx context.Context, datastore Datastore, refreshPrometh
 	}
 }
 
-func refreshPrometheusMetrics(logger logr.Logger, datastore Datastore) {
+func refreshPrometheusMetrics(logger logr.Logger, datastore Datastore, metricsStalenessThreshold time.Duration) {
 	pool, err := datastore.PoolGet()
 	if err != nil {
 		// No inference pool or not initialize.
@@ -97,7 +94,9 @@ func refreshPrometheusMetrics(logger logr.Logger, datastore Datastore) {
 	var kvCacheTotal float64
 	var queueTotal int
 
-	podMetrics := datastore.PodGetAll()
+	podMetrics := datastore.PodList(func(pm PodMetrics) bool {
+		return time.Since(pm.GetMetrics().UpdateTime) <= metricsStalenessThreshold
+	})
 	logger.V(logutil.TRACE).Info("Refreshing Prometheus Metrics", "ReadyPods", len(podMetrics))
 	if len(podMetrics) == 0 {
 		return
 
@@ -36,11 +36,12 @@ const (
 )
 
 type podMetrics struct {
-	pod      atomic.Pointer[backend.Pod]
-	metrics  atomic.Pointer[MetricsState]
-	pmc      PodMetricsClient
-	ds       Datastore
-	interval time.Duration
+	pod                atomic.Pointer[backend.Pod]
+	metrics            atomic.Pointer[MetricsState]
+	pmc                PodMetricsClient
+	ds                 Datastore
+	interval           time.Duration
+	stalenessThreshold time.Duration
 
 	startOnce sync.Once // ensures the refresh loop goroutine is started only once
 	stopOnce  sync.Once // ensures the done channel is closed only once
@@ -69,6 +70,10 @@ func (pm *podMetrics) UpdatePod(pod *corev1.Pod) {
 	pm.pod.Store(toInternalPod(pod))
 }
 
+func (pm *podMetrics) GetMetricsStalenessThreshold() time.Duration {
+	return pm.stalenessThreshold
+}
+
 func toInternalPod(pod *corev1.Pod) *backend.Pod {
 	labels := make(map[string]string, len(pod.GetLabels()))
 	for key, value := range pod.GetLabels() {
 
@@ -31,26 +31,27 @@ func NewPodMetricsFactory(pmc PodMetricsClient, refreshMetricsInterval, metricsS
 	return &PodMetricsFactory{
 		pmc:                       pmc,
 		refreshMetricsInterval:    refreshMetricsInterval,
-		MetricsStalenessThreshold: metricsStalenessThreshold,
+		metricsStalenessThreshold: metricsStalenessThreshold,
 	}
 }
 
 type PodMetricsFactory struct {
 	pmc                       PodMetricsClient
 	refreshMetricsInterval    time.Duration
-	MetricsStalenessThreshold time.Duration
+	metricsStalenessThreshold time.Duration
 }
 
 func (f *PodMetricsFactory) NewPodMetrics(parentCtx context.Context, in *corev1.Pod, ds Datastore) PodMetrics {
 	pod := toInternalPod(in)
 	pm := &podMetrics{
-		pmc:       f.pmc,
-		ds:        ds,
-		interval:  f.refreshMetricsInterval,
-		startOnce: sync.Once{},
-		stopOnce:  sync.Once{},
-		done:      make(chan struct{}),
-		logger:    log.FromContext(parentCtx).WithValues("pod", pod.NamespacedName),
+		pmc:                f.pmc,
+		ds:                 ds,
+		interval:           f.refreshMetricsInterval,
+		stalenessThreshold: f.metricsStalenessThreshold,
+		startOnce:          sync.Once{},
+		stopOnce:           sync.Once{},
+		done:               make(chan struct{}),
+		logger:             log.FromContext(parentCtx).WithValues("pod", pod.NamespacedName),
 	}
 	pm.pod.Store(pod)
 	pm.metrics.Store(NewMetricsState())
@@ -62,7 +63,12 @@ func (f *PodMetricsFactory) NewPodMetrics(parentCtx context.Context, in *corev1.
 type PodMetrics interface {
 	GetPod() *backend.Pod
 	GetMetrics() *MetricsState
+	GetMetricsStalenessThreshold() time.Duration
 	UpdatePod(*corev1.Pod)
 	StopRefreshLoop()
 	String() string
 }
+
+func FreshMetricsFn(pm PodMetrics) bool {
+	return time.Since(pm.GetMetrics().UpdateTime) <= pm.GetMetricsStalenessThreshold()
+}
@@ -29,7 +29,16 @@ const (
 	DefaultQueueThresholdCritical = 5
 	// DefaultMetricsStalenessThreshold defines how old metrics can be before they
 	// are considered stale.
-	// Given the pod metrics refresh interval is 50ms, a threshold slightly above
-	// that should be fine.
-	DefaultMetricsStalenessThreshold = 200 * time.Millisecond
+	// The staleness is determined by the refresh interval plus the latency of the metrics API.
+	// To be on the safer side, we start with a larger threshold.
+	DefaultMetricsStalenessThreshold                = 2 * time.Second                  // default for --metricsStalenessThreshold
+	DefaultGrpcPort                                 = 9002                             // default for --grpcPort
+	DefaultDestinationEndpointHintMetadataNamespace = "envoy.lb"                       // default for --DestinationEndpointHintMetadataNamespace
+	DefaultDestinationEndpointHintKey               = "x-gateway-destination-endpoint" // default for --destinationEndpointHintKey
+	DefaultPoolName                                 = ""                               // required but no default
+	DefaultPoolNamespace                            = "default"                        // default for --poolNamespace
+	DefaultRefreshMetricsInterval                   = 50 * time.Millisecond            // default for --refreshMetricsInterval
+	DefaultRefreshPrometheusMetricsInterval         = 5 * time.Second                  // default for --refreshPrometheusMetricsInterval
+	DefaultSecureServing                            = true                             // default for --secureServing
+	DefaultHealthChecking                           = false                            // default for --healthChecking
 )
@@ -22,7 +22,6 @@ import (
 	"fmt"
 	"reflect"
 	"sync"
-	"time"
 
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/labels"
@@ -62,7 +61,7 @@ type Datastore interface {
 	ModelGetAll() []*v1alpha2.InferenceModel
 
 	// PodMetrics operations
-	// PodGetAll returns all pods and metrics, including fresh and stale.
+	// PodGetAll returns all pods regardless of whether their metrics are fresh or stale; intended for testing only.
 	PodGetAll() []backendmetrics.PodMetrics
 	// PodList lists pods matching the given predicate.
 	PodList(predicate func(backendmetrics.PodMetrics) bool) []backendmetrics.PodMetrics
@@ -93,9 +92,8 @@ type datastore struct {
 	// key: InferenceModel.Spec.ModelName, value: *InferenceModel
 	models map[string]*v1alpha2.InferenceModel
 	// key: types.NamespacedName, value: backendmetrics.PodMetrics
-	pods                      *sync.Map
-	pmf                       *backendmetrics.PodMetricsFactory
-	MetricsStalenessThreshold time.Duration
+	pods *sync.Map
+	pmf  *backendmetrics.PodMetricsFactory
 }
 
 func (ds *datastore) Clear() {
@@ -247,9 +245,7 @@ func (ds *datastore) ModelGetAll() []*v1alpha2.InferenceModel {
 // /// Pods/endpoints APIs ///
 
 func (ds *datastore) PodGetAll() []backendmetrics.PodMetrics {
-	return ds.PodList(func(pm backendmetrics.PodMetrics) bool {
-		return time.Since(pm.GetMetrics().UpdateTime) <= ds.pmf.MetricsStalenessThreshold
-	})
+	return ds.PodList(func(backendmetrics.PodMetrics) bool { return true })
 }
 
 func (ds *datastore) PodList(predicate func(backendmetrics.PodMetrics) bool) []backendmetrics.PodMetrics {
 
@@ -31,6 +31,7 @@ import (
 	"k8s.io/apimachinery/pkg/types"
 	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
 	"sigs.k8s.io/controller-runtime/pkg/client/fake"
+
 	"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
 	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/common/config"
@@ -263,6 +264,12 @@ var (
 		},
 		WaitingModels: map[string]int{},
 	}
+	pod3 = &corev1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: "pod3",
+		},
+	}
+
 	pod1NamespacedName = types.NamespacedName{Name: pod1.Name, Namespace: pod1.Namespace}
 	pod2NamespacedName = types.NamespacedName{Name: pod2.Name, Namespace: pod2.Namespace}
 	inferencePool      = &v1alpha2.InferencePool{
@@ -312,17 +319,19 @@ func TestMetrics(t *testing.T) {
 				},
 			},
 			storePods: []*corev1.Pod{pod1, pod2},
-			want: []*backendmetrics.MetricsState{
-				pod1Metrics,
-				// Failed to fetch pod2 metrics so it remains the default values.
-				{
-					ActiveModels:        map[string]int{},
-					WaitingModels:       map[string]int{},
-					WaitingQueueSize:    0,
-					KVCacheUsagePercent: 0,
-					MaxActiveModels:     0,
+			want:      []*backendmetrics.MetricsState{pod1Metrics},
+		},
+		{
+			name: "Filter stale metrics",
+			pmc: &backendmetrics.FakePodMetricsClient{
+				Res: map[types.NamespacedName]*backendmetrics.MetricsState{
+					pod1NamespacedName: pod1Metrics,
+					pod2NamespacedName: pod2Metrics,
 				},
 			},
+			storePods: []*corev1.Pod{pod1, pod2, pod3},
+			want:      []*backendmetrics.MetricsState{pod1Metrics, pod2Metrics}, // pod3 metrics were stale and should not be included.
+
 		},
 	}
 
@@ -342,10 +351,9 @@ func TestMetrics(t *testing.T) {
 			for _, pod := range test.storePods {
 				ds.PodUpdateOrAddIfNotExist(pod)
 			}
+			time.Sleep(1 * time.Second) // Give some time for the metrics to be fetched.
 			assert.EventuallyWithT(t, func(t *assert.CollectT) {
-				got := ds.PodList(func(backendmetrics.PodMetrics) bool {
-					return true
-				})
+				got := ds.PodList(backendmetrics.FreshMetricsFn)
 				metrics := []*backendmetrics.MetricsState{}
 				for _, one := range got {
 					metrics = append(metrics, one.GetMetrics())
 
@@ -20,6 +20,7 @@ import (
 	"github.com/prometheus/client_golang/prometheus"
 	compbasemetrics "k8s.io/component-base/metrics"
 
+	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
 	metricsutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/metrics"
 )
@@ -62,7 +63,7 @@ func (c *inferencePoolMetricsCollector) Collect(ch chan<- prometheus.Metric) {
 		return
 	}
 
-	podMetrics := c.ds.PodGetAll()
+	podMetrics := c.ds.PodList(backendmetrics.FreshMetricsFn)
 	if len(podMetrics) == 0 {
 		return
 	}
 
@@ -508,7 +508,7 @@ func TestGetCandidatePodsForScheduling(t *testing.T) {
 		},
 	}
 
-	pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Second)
+	pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Second, config.DefaultMetricsStalenessThreshold)
 	ds := datastore.NewDatastore(t.Context(), pmf)
 	for _, testPod := range testInput {
 		ds.PodUpdateOrAddIfNotExist(testPod)
Original file line number	Diff line number	Diff line change
`@@ -20,6 +20,7 @@ import (`
`20`	`20`	`"github.com/prometheus/client_golang/prometheus"`
`21`	`21`	`compbasemetrics "k8s.io/component-base/metrics"`
`22`	`22`
	`23`	`+ backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"`
`23`	`24`	`"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"`
`24`	`25`	`metricsutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/metrics"`
`25`	`26`	`)`
`@@ -62,7 +63,7 @@ func (c *inferencePoolMetricsCollector) Collect(ch chan<- prometheus.Metric) {`
`62`	`63`	`return`
`63`	`64`	`}`
`64`	`65`
`65`		`- podMetrics := c.ds.PodGetAll()`
	`66`	`+ podMetrics := c.ds.PodList(backendmetrics.FreshMetricsFn)`
`66`	`67`	`if len(podMetrics) == 0 {`
`67`	`68`	`return`
`68`	`69`	`}`
Original file line number	Diff line number	Diff line change
`@@ -508,7 +508,7 @@ func TestGetCandidatePodsForScheduling(t *testing.T) {`
`508`	`508`	`},`
`509`	`509`	`}`
`510`	`510`
`511`		`- pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Second)`
	`511`	`+ pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Second, config.DefaultMetricsStalenessThreshold)`
`512`	`512`	`ds := datastore.NewDatastore(t.Context(), pmf)`
`513`	`513`	`for _, testPod := range testInput {`
`514`	`514`	`ds.PodUpdateOrAddIfNotExist(testPod)`