From 02faef858fb85bae82f3751402162a31a05f7905 Mon Sep 17 00:00:00 2001 From: Nikos Angelopoulos Date: Wed, 24 Sep 2025 18:02:47 +0200 Subject: [PATCH 1/6] ruler: add optional failure classifier hook Signed-off-by: Nikos Angelopoulos --- rules/group.go | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/rules/group.go b/rules/group.go index 01ec2ecec9..ac69ce4396 100644 --- a/rules/group.go +++ b/rules/group.go @@ -74,6 +74,8 @@ type Group struct { // defaults to DefaultEvalIterationFunc. evalIterationFunc GroupEvalIterationFunc + failureClassifierFunc EvaluationFailureClassifierFunc + appOpts *storage.AppendOptions alignEvaluationTimeOnInterval bool } @@ -85,6 +87,8 @@ type Group struct { // DefaultEvalIterationFunc is the default implementation. type GroupEvalIterationFunc func(ctx context.Context, g *Group, evalTimestamp time.Time) +type EvaluationFailureClassifierFunc func(error) bool + type GroupOptions struct { Name, File string Interval time.Duration @@ -97,6 +101,14 @@ type GroupOptions struct { done chan struct{} EvalIterationFunc GroupEvalIterationFunc AlignEvaluationTimeOnInterval bool + FailureClassifierFunc EvaluationFailureClassifierFunc +} + +// DefaultEvaluationFailureClassifierFunc is the default implementation of +// EvaluationFailureClassifierFunc that classifies no errors as operator-controllable. +// Custom implementations can be provided to classify specific error types. +func DefaultEvaluationFailureClassifierFunc(_ error) bool { + return false } // NewGroup makes a new Group with the given name, options, and rules. @@ -115,6 +127,7 @@ func NewGroup(o GroupOptions) *Group { metrics.IterationsScheduled.WithLabelValues(key) metrics.EvalTotal.WithLabelValues(key) metrics.EvalFailures.WithLabelValues(key) + metrics.EvalFilteredFailures.WithLabelValues(key) metrics.GroupLastEvalTime.WithLabelValues(key) metrics.GroupLastDuration.WithLabelValues(key) metrics.GroupLastRuleDurationSum.WithLabelValues(key) @@ -127,6 +140,11 @@ func NewGroup(o GroupOptions) *Group { evalIterationFunc = DefaultEvalIterationFunc } + failureClassifierFunc := o.FailureClassifierFunc + if failureClassifierFunc == nil { + failureClassifierFunc = DefaultEvaluationFailureClassifierFunc + } + if opts.Logger == nil { opts.Logger = promslog.NewNopLogger() } @@ -150,6 +168,7 @@ func NewGroup(o GroupOptions) *Group { evalIterationFunc: evalIterationFunc, appOpts: &storage.AppendOptions{DiscardOutOfOrder: true}, alignEvaluationTimeOnInterval: o.AlignEvaluationTimeOnInterval, + failureClassifierFunc: failureClassifierFunc, } } @@ -548,6 +567,9 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) { sp.SetStatus(codes.Error, err.Error()) g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc() + if g.failureClassifierFunc != nil && g.failureClassifierFunc(err) { + g.metrics.EvalFilteredFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc() + } // Canceled queries are intentional termination of queries. This normally // happens on shutdown and thus we skip logging of any errors here. 
var eqc promql.ErrQueryCanceled @@ -578,6 +600,10 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) { sp.SetStatus(codes.Error, err.Error()) g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc() + if g.failureClassifierFunc != nil && g.failureClassifierFunc(err) { + g.metrics.EvalFilteredFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc() + } + logger.Warn("Rule sample appending failed", "err", err) return } @@ -954,6 +980,7 @@ type Metrics struct { IterationsScheduled *prometheus.CounterVec EvalTotal *prometheus.CounterVec EvalFailures *prometheus.CounterVec + EvalFilteredFailures *prometheus.CounterVec GroupInterval *prometheus.GaugeVec GroupLastEvalTime *prometheus.GaugeVec GroupLastDuration *prometheus.GaugeVec @@ -1012,6 +1039,14 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics { }, []string{"rule_group"}, ), + EvalFilteredFailures: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Name: "rule_evaluation_filtered_failures_total", + Help: "The total number of rule evaluation failures classified by the failure classifier function.", + }, + []string{"rule_group"}, + ), GroupInterval: prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: namespace, @@ -1078,6 +1113,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics { m.IterationsScheduled, m.EvalTotal, m.EvalFailures, + m.EvalFilteredFailures, m.GroupInterval, m.GroupLastEvalTime, m.GroupLastDuration, From a02ff2ae23d237f916ba2913443e1d3f522cc45b Mon Sep 17 00:00:00 2001 From: Nikos Angelopoulos Date: Wed, 24 Sep 2025 18:03:41 +0200 Subject: [PATCH 2/6] ruler: add rule evaluation test with a custom failure classifier --- rules/group_test.go | 76 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/rules/group_test.go b/rules/group_test.go index c5c812b168..076361d1e6 100644 --- a/rules/group_test.go +++ b/rules/group_test.go @@ -14,13 +14,20 @@ package rules import ( + "context" + "fmt" + "strings" "testing" "time" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/prometheus/common/promslog" "github.com/stretchr/testify/require" "github.com/prometheus/prometheus/model/labels" + "github.com/prometheus/prometheus/promql" "github.com/prometheus/prometheus/promql/parser" + "github.com/prometheus/prometheus/util/teststorage" ) func TestGroup_Equals(t *testing.T) { @@ -248,6 +255,75 @@ func TestGroup_Equals(t *testing.T) { } } +func TestEvalFilteredFailures(t *testing.T) { + storage := teststorage.New(t) + t.Cleanup(func() { storage.Close() }) + + expr, err := parser.ParseExpr("up") + require.NoError(t, err) + rule := NewRecordingRule("test_rule", expr, labels.EmptyLabels()) + + // Custom classifier filters 429 and 5xx status codes + customClassifier := func(err error) bool { + if err == nil { + return false + } + errMsg := err.Error() + return strings.Contains(errMsg, "429") || strings.Contains(errMsg, "50") + } + + testCases := []struct { + name string + errorMessage string + classifier func(error) bool + expectFiltered bool + }{ + {"default classifier", "any error", nil, false}, + {"custom 429 filtered", "HTTP 429 Too Many Requests", customClassifier, true}, + {"custom 500 filtered", "HTTP 500 Internal Server Error", customClassifier, true}, + {"custom 502 filtered", "HTTP 502 Bad Gateway", customClassifier, true}, + {"custom 400 not filtered", "HTTP 400 Bad Request", customClassifier, false}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + errorQueryFunc := func(_ 
context.Context, _ string, _ time.Time) (promql.Vector, error) { + return nil, fmt.Errorf("%s", tc.errorMessage) + } + + opts := &ManagerOptions{ + Context: context.Background(), + QueryFunc: errorQueryFunc, + Appendable: storage, + Queryable: storage, + Logger: promslog.NewNopLogger(), + } + + group := NewGroup(GroupOptions{ + Name: "test_group", + File: "test.yml", + Interval: time.Second, + Rules: []Rule{rule}, + Opts: opts, + FailureClassifierFunc: tc.classifier, + }) + + group.Eval(context.Background(), time.Now()) + + groupKey := GroupKey("test.yml", "test_group") + evalFailures := testutil.ToFloat64(group.metrics.EvalFailures.WithLabelValues(groupKey)) + evalFilteredFailures := testutil.ToFloat64(group.metrics.EvalFilteredFailures.WithLabelValues(groupKey)) + + require.Equal(t, float64(1), evalFailures) + if tc.expectFiltered { + require.Equal(t, float64(1), evalFilteredFailures) + } else { + require.Equal(t, float64(0), evalFilteredFailures) + } + }) + } +} + func pointerOf[T any](value T) *T { return &value } From 951a45758e00030c771ef028880692f735f3a8fe Mon Sep 17 00:00:00 2001 From: Nikos Angelopoulos Date: Thu, 25 Sep 2025 10:47:25 +0200 Subject: [PATCH 3/6] test(ruler/group): add TestEvalDiscardedSamplesDoNotIncrementFailureMetrics Signed-off-by: Nikos Angelopoulos --- rules/group_test.go | 74 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/rules/group_test.go b/rules/group_test.go index 076361d1e6..a01bc353b8 100644 --- a/rules/group_test.go +++ b/rules/group_test.go @@ -324,6 +324,80 @@ func TestEvalFilteredFailures(t *testing.T) { } } +func TestEvalDiscardedSamplesDoNotIncrementFailureMetrics(t *testing.T) { + testCases := []struct { + name string + setupStorage func(storage *teststorage.TestStorage) + offsetMs int64 // milliseconds offset from evaluation time + }{ + { + name: "out of order samples", + setupStorage: func(s *teststorage.TestStorage) { + app := s.Appender(context.Background()) + app.Append(0, labels.FromStrings("__name__", "test_metric", "job", "test"), time.Now().UnixMilli(), 1.0) + app.Commit() + }, + offsetMs: -10000, // 10 seconds in past + }, + { + name: "too old samples", + setupStorage: func(_ *teststorage.TestStorage) {}, + offsetMs: -86400000, // 24 hours in past + }, + { + name: "duplicate samples", + setupStorage: func(s *teststorage.TestStorage) { + app := s.Appender(context.Background()) + app.Append(0, labels.FromStrings("__name__", "test_metric", "job", "test"), time.Now().UnixMilli(), 1.0) + app.Commit() + }, + offsetMs: 0, // Same timestamp, different value + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + storage := teststorage.New(t) + t.Cleanup(func() { storage.Close() }) + + tc.setupStorage(storage) + + expr, err := parser.ParseExpr("up") + require.NoError(t, err) + rule := NewRecordingRule("test_rule", expr, labels.EmptyLabels()) + + queryFunc := func(_ context.Context, _ string, ts time.Time) (promql.Vector, error) { + return promql.Vector{ + promql.Sample{ + Metric: labels.FromStrings("__name__", "test_metric", "job", "test"), + T: ts.UnixMilli() + tc.offsetMs, + F: 2.0, // Different value for duplicate case + }, + }, nil + } + + group := NewGroup(GroupOptions{ + Name: "test_group", + File: "test.yml", + Interval: time.Second, + Rules: []Rule{rule}, + Opts: &ManagerOptions{ + Context: context.Background(), + QueryFunc: queryFunc, + Appendable: storage, + Queryable: storage, + Logger: promslog.NewNopLogger(), + }, + }) + + group.Eval(context.Background(), 
time.Now()) + + require.Equal(t, float64(0), testutil.ToFloat64(group.metrics.EvalFailures.WithLabelValues(GroupKey("test.yml", "test_group")))) + require.Equal(t, float64(0), testutil.ToFloat64(group.metrics.EvalFilteredFailures.WithLabelValues(GroupKey("test.yml", "test_group")))) + }) + } +} + func pointerOf[T any](value T) *T { return &value } From 814e8bfc808e54f4584cfc357ba589688debc3b9 Mon Sep 17 00:00:00 2001 From: Nikos Angelopoulos Date: Tue, 7 Oct 2025 14:00:07 +0200 Subject: [PATCH 4/6] refactor: use an interface for the classifier and make naming more explicit to the use case Signed-off-by: Nikos Angelopoulos --- rules/group.go | 130 +++++++++++++++++++++++--------------------- rules/group_test.go | 61 +++++++++++---------- 2 files changed, 100 insertions(+), 91 deletions(-) diff --git a/rules/group.go b/rules/group.go index ac69ce4396..f3324b0149 100644 --- a/rules/group.go +++ b/rules/group.go @@ -74,7 +74,7 @@ type Group struct { // defaults to DefaultEvalIterationFunc. evalIterationFunc GroupEvalIterationFunc - failureClassifierFunc EvaluationFailureClassifierFunc + operatorControllableErrorClassifier OperatorControllableErrorClassifier appOpts *storage.AppendOptions alignEvaluationTimeOnInterval bool @@ -87,27 +87,31 @@ type Group struct { // DefaultEvalIterationFunc is the default implementation. type GroupEvalIterationFunc func(ctx context.Context, g *Group, evalTimestamp time.Time) -type EvaluationFailureClassifierFunc func(error) bool +// OperatorControllableErrorClassifier classifies whether rule evaluation errors are operator-controllable. +type OperatorControllableErrorClassifier interface { + IsOperatorControllable(error) bool +} type GroupOptions struct { - Name, File string - Interval time.Duration - Limit int - Rules []Rule - SourceTenants []string - ShouldRestore bool - Opts *ManagerOptions - QueryOffset *time.Duration - done chan struct{} - EvalIterationFunc GroupEvalIterationFunc - AlignEvaluationTimeOnInterval bool - FailureClassifierFunc EvaluationFailureClassifierFunc + Name, File string + Interval time.Duration + Limit int + Rules []Rule + SourceTenants []string + ShouldRestore bool + Opts *ManagerOptions + QueryOffset *time.Duration + done chan struct{} + EvalIterationFunc GroupEvalIterationFunc + AlignEvaluationTimeOnInterval bool + OperatorControllableErrorClassifier OperatorControllableErrorClassifier } -// DefaultEvaluationFailureClassifierFunc is the default implementation of -// EvaluationFailureClassifierFunc that classifies no errors as operator-controllable. -// Custom implementations can be provided to classify specific error types. -func DefaultEvaluationFailureClassifierFunc(_ error) bool { +// DefaultOperatorControllableErrorClassifier is the default implementation of +// OperatorControllableErrorClassifier that classifies no errors as operator-controllable. 
+type DefaultOperatorControllableErrorClassifier struct{} + +func (*DefaultOperatorControllableErrorClassifier) IsOperatorControllable(_ error) bool { return false } @@ -127,7 +131,7 @@ func NewGroup(o GroupOptions) *Group { metrics.IterationsScheduled.WithLabelValues(key) metrics.EvalTotal.WithLabelValues(key) metrics.EvalFailures.WithLabelValues(key) - metrics.EvalFilteredFailures.WithLabelValues(key) + metrics.EvalOperatorControllableFailures.WithLabelValues(key) metrics.GroupLastEvalTime.WithLabelValues(key) metrics.GroupLastDuration.WithLabelValues(key) metrics.GroupLastRuleDurationSum.WithLabelValues(key) @@ -140,9 +144,9 @@ func NewGroup(o GroupOptions) *Group { evalIterationFunc = DefaultEvalIterationFunc } - failureClassifierFunc := o.FailureClassifierFunc - if failureClassifierFunc == nil { - failureClassifierFunc = DefaultEvaluationFailureClassifierFunc + operatorControllableErrorClassifier := o.OperatorControllableErrorClassifier + if operatorControllableErrorClassifier == nil { + operatorControllableErrorClassifier = &DefaultOperatorControllableErrorClassifier{} } if opts.Logger == nil { @@ -150,25 +154,25 @@ func NewGroup(o GroupOptions) *Group { } return &Group{ - name: o.Name, - file: o.File, - interval: o.Interval, - queryOffset: o.QueryOffset, - limit: o.Limit, - rules: o.Rules, - shouldRestore: o.ShouldRestore, - opts: opts, - sourceTenants: o.SourceTenants, - seriesInPreviousEval: make([]map[string]labels.Labels, len(o.Rules)), - done: make(chan struct{}), - managerDone: o.done, - terminated: make(chan struct{}), - logger: opts.Logger.With("file", o.File, "group", o.Name), - metrics: metrics, - evalIterationFunc: evalIterationFunc, - appOpts: &storage.AppendOptions{DiscardOutOfOrder: true}, - alignEvaluationTimeOnInterval: o.AlignEvaluationTimeOnInterval, - failureClassifierFunc: failureClassifierFunc, + name: o.Name, + file: o.File, + interval: o.Interval, + queryOffset: o.QueryOffset, + limit: o.Limit, + rules: o.Rules, + shouldRestore: o.ShouldRestore, + opts: opts, + sourceTenants: o.SourceTenants, + seriesInPreviousEval: make([]map[string]labels.Labels, len(o.Rules)), + done: make(chan struct{}), + managerDone: o.done, + terminated: make(chan struct{}), + logger: opts.Logger.With("file", o.File, "group", o.Name), + metrics: metrics, + evalIterationFunc: evalIterationFunc, + appOpts: &storage.AppendOptions{DiscardOutOfOrder: true}, + alignEvaluationTimeOnInterval: o.AlignEvaluationTimeOnInterval, + operatorControllableErrorClassifier: operatorControllableErrorClassifier, } } @@ -567,8 +571,8 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) { sp.SetStatus(codes.Error, err.Error()) g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc() - if g.failureClassifierFunc != nil && g.failureClassifierFunc(err) { - g.metrics.EvalFilteredFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc() + if g.operatorControllableErrorClassifier != nil && g.operatorControllableErrorClassifier.IsOperatorControllable(err) { + g.metrics.EvalOperatorControllableFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc() } // Canceled queries are intentional termination of queries. This normally // happens on shutdown and thus we skip logging of any errors here. 
@@ -600,8 +604,8 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) { sp.SetStatus(codes.Error, err.Error()) g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc() - if g.failureClassifierFunc != nil && g.failureClassifierFunc(err) { - g.metrics.EvalFilteredFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc() + if g.operatorControllableErrorClassifier != nil && g.operatorControllableErrorClassifier.IsOperatorControllable(err) { + g.metrics.EvalOperatorControllableFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc() } logger.Warn("Rule sample appending failed", "err", err) @@ -974,20 +978,20 @@ const namespace = "prometheus" // Metrics for rule evaluation. type Metrics struct { - EvalDuration prometheus.Summary - IterationDuration prometheus.Summary - IterationsMissed *prometheus.CounterVec - IterationsScheduled *prometheus.CounterVec - EvalTotal *prometheus.CounterVec - EvalFailures *prometheus.CounterVec - EvalFilteredFailures *prometheus.CounterVec - GroupInterval *prometheus.GaugeVec - GroupLastEvalTime *prometheus.GaugeVec - GroupLastDuration *prometheus.GaugeVec - GroupLastRuleDurationSum *prometheus.GaugeVec - GroupLastRestoreDuration *prometheus.GaugeVec - GroupRules *prometheus.GaugeVec - GroupSamples *prometheus.GaugeVec + EvalDuration prometheus.Summary + IterationDuration prometheus.Summary + IterationsMissed *prometheus.CounterVec + IterationsScheduled *prometheus.CounterVec + EvalTotal *prometheus.CounterVec + EvalFailures *prometheus.CounterVec + EvalOperatorControllableFailures *prometheus.CounterVec + GroupInterval *prometheus.GaugeVec + GroupLastEvalTime *prometheus.GaugeVec + GroupLastDuration *prometheus.GaugeVec + GroupLastRuleDurationSum *prometheus.GaugeVec + GroupLastRestoreDuration *prometheus.GaugeVec + GroupRules *prometheus.GaugeVec + GroupSamples *prometheus.GaugeVec } // NewGroupMetrics creates a new instance of Metrics and registers it with the provided registerer, @@ -1039,11 +1043,11 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics { }, []string{"rule_group"}, ), - EvalFilteredFailures: prometheus.NewCounterVec( + EvalOperatorControllableFailures: prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: namespace, - Name: "rule_evaluation_filtered_failures_total", - Help: "The total number of rule evaluation failures classified by the failure classifier function.", + Name: "rule_evaluation_operator_controllable_failures_total", + Help: "The total number of rule evaluation failures that are operator-controllable.", }, []string{"rule_group"}, ), @@ -1113,7 +1117,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics { m.IterationsScheduled, m.EvalTotal, m.EvalFailures, - m.EvalFilteredFailures, + m.EvalOperatorControllableFailures, m.GroupInterval, m.GroupLastEvalTime, m.GroupLastDuration, diff --git a/rules/group_test.go b/rules/group_test.go index a01bc353b8..0ef0a6ede8 100644 --- a/rules/group_test.go +++ b/rules/group_test.go @@ -255,7 +255,19 @@ func TestGroup_Equals(t *testing.T) { } } -func TestEvalFilteredFailures(t *testing.T) { +// HTTPStatusOperatorControllableErrorClassifier is a test classifier that identifies +// 429 and 5xx status codes as operator-controllable errors. 
+type HTTPStatusOperatorControllableErrorClassifier struct{} + +func (*HTTPStatusOperatorControllableErrorClassifier) IsOperatorControllable(err error) bool { + if err == nil { + return false + } + errMsg := err.Error() + return strings.Contains(errMsg, "429") || strings.Contains(errMsg, "50") +} + +func TestEvalOperatorControllableFailures(t *testing.T) { storage := teststorage.New(t) t.Cleanup(func() { storage.Close() }) @@ -263,26 +275,19 @@ func TestEvalFilteredFailures(t *testing.T) { require.NoError(t, err) rule := NewRecordingRule("test_rule", expr, labels.EmptyLabels()) - // Custom classifier filters 429 and 5xx status codes - customClassifier := func(err error) bool { - if err == nil { - return false - } - errMsg := err.Error() - return strings.Contains(errMsg, "429") || strings.Contains(errMsg, "50") - } + customClassifier := &HTTPStatusOperatorControllableErrorClassifier{} testCases := []struct { - name string - errorMessage string - classifier func(error) bool - expectFiltered bool + name string + errorMessage string + classifier OperatorControllableErrorClassifier + expectOperatorControllable bool }{ {"default classifier", "any error", nil, false}, - {"custom 429 filtered", "HTTP 429 Too Many Requests", customClassifier, true}, - {"custom 500 filtered", "HTTP 500 Internal Server Error", customClassifier, true}, - {"custom 502 filtered", "HTTP 502 Bad Gateway", customClassifier, true}, - {"custom 400 not filtered", "HTTP 400 Bad Request", customClassifier, false}, + {"custom 429 classified as operator controllable", "HTTP 429 Too Many Requests", customClassifier, true}, + {"custom 500 classified as operator controllable", "HTTP 500 Internal Server Error", customClassifier, true}, + {"custom 502 classified as operator controllable", "HTTP 502 Bad Gateway", customClassifier, true}, + {"custom 400 not operator controllable", "HTTP 400 Bad Request", customClassifier, false}, } for _, tc := range testCases { @@ -300,25 +305,25 @@ func TestEvalFilteredFailures(t *testing.T) { } group := NewGroup(GroupOptions{ - Name: "test_group", - File: "test.yml", - Interval: time.Second, - Rules: []Rule{rule}, - Opts: opts, - FailureClassifierFunc: tc.classifier, + Name: "test_group", + File: "test.yml", + Interval: time.Second, + Rules: []Rule{rule}, + Opts: opts, + OperatorControllableErrorClassifier: tc.classifier, }) group.Eval(context.Background(), time.Now()) groupKey := GroupKey("test.yml", "test_group") evalFailures := testutil.ToFloat64(group.metrics.EvalFailures.WithLabelValues(groupKey)) - evalFilteredFailures := testutil.ToFloat64(group.metrics.EvalFilteredFailures.WithLabelValues(groupKey)) + evalOperatorControllableFailures := testutil.ToFloat64(group.metrics.EvalOperatorControllableFailures.WithLabelValues(groupKey)) require.Equal(t, float64(1), evalFailures) - if tc.expectFiltered { - require.Equal(t, float64(1), evalFilteredFailures) + if tc.expectOperatorControllable { + require.Equal(t, float64(1), evalOperatorControllableFailures) } else { - require.Equal(t, float64(0), evalFilteredFailures) + require.Equal(t, float64(0), evalOperatorControllableFailures) } }) } @@ -393,7 +398,7 @@ func TestEvalDiscardedSamplesDoNotIncrementFailureMetrics(t *testing.T) { group.Eval(context.Background(), time.Now()) require.Equal(t, float64(0), testutil.ToFloat64(group.metrics.EvalFailures.WithLabelValues(GroupKey("test.yml", "test_group")))) - require.Equal(t, float64(0), testutil.ToFloat64(group.metrics.EvalFilteredFailures.WithLabelValues(GroupKey("test.yml", "test_group")))) + 
require.Equal(t, float64(0), testutil.ToFloat64(group.metrics.EvalOperatorControllableFailures.WithLabelValues(GroupKey("test.yml", "test_group")))) }) } } From a7f215689fc810a96002076b0d546a1ab8e5bf04 Mon Sep 17 00:00:00 2001 From: Nikos Angelopoulos Date: Wed, 8 Oct 2025 11:21:51 +0200 Subject: [PATCH 5/6] refactor: switch from a separate metric to a cause label --- rules/group.go | 61 ++++++++++++++++++------------------- rules/group_test.go | 16 +++++++----- rules/manager.go | 3 ++- rules/manager_test.go | 6 ++--- 4 files changed, 40 insertions(+), 46 deletions(-) diff --git a/rules/group.go b/rules/group.go index f3324b0149..5e373dc6bb 100644 --- a/rules/group.go +++ b/rules/group.go @@ -130,8 +130,8 @@ func NewGroup(o GroupOptions) *Group { metrics.IterationsMissed.WithLabelValues(key) metrics.IterationsScheduled.WithLabelValues(key) metrics.EvalTotal.WithLabelValues(key) - metrics.EvalFailures.WithLabelValues(key) - metrics.EvalOperatorControllableFailures.WithLabelValues(key) + metrics.EvalFailures.WithLabelValues(key, "user") + metrics.EvalFailures.WithLabelValues(key, "operator") metrics.GroupLastEvalTime.WithLabelValues(key) metrics.GroupLastDuration.WithLabelValues(key) metrics.GroupLastRuleDurationSum.WithLabelValues(key) @@ -569,11 +569,8 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) { rule.SetHealth(HealthBad) rule.SetLastError(err) sp.SetStatus(codes.Error, err.Error()) - g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc() + g.incrementEvalFailures(err) - if g.operatorControllableErrorClassifier != nil && g.operatorControllableErrorClassifier.IsOperatorControllable(err) { - g.metrics.EvalOperatorControllableFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc() - } // Canceled queries are intentional termination of queries. This normally // happens on shutdown and thus we skip logging of any errors here. var eqc promql.ErrQueryCanceled @@ -602,11 +599,7 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) { rule.SetHealth(HealthBad) rule.SetLastError(err) sp.SetStatus(codes.Error, err.Error()) - g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc() - - if g.operatorControllableErrorClassifier != nil && g.operatorControllableErrorClassifier.IsOperatorControllable(err) { - g.metrics.EvalOperatorControllableFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc() - } + g.incrementEvalFailures(err) logger.Warn("Rule sample appending failed", "err", err) return @@ -731,6 +724,14 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) { g.cleanupStaleSeries(ctx, ts) } +func (g *Group) incrementEvalFailures(err error) { + cause := "user" + if g.operatorControllableErrorClassifier != nil && g.operatorControllableErrorClassifier.IsOperatorControllable(err) { + cause = "operator" + } + g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name()), cause).Inc() +} + func (g *Group) QueryOffset() time.Duration { if g.queryOffset != nil { return *g.queryOffset } @@ -978,20 +979,19 @@ const namespace = "prometheus" // Metrics for rule evaluation. 
type Metrics struct { - EvalDuration prometheus.Summary - IterationDuration prometheus.Summary - IterationsMissed *prometheus.CounterVec - IterationsScheduled *prometheus.CounterVec - EvalTotal *prometheus.CounterVec - EvalFailures *prometheus.CounterVec - EvalOperatorControllableFailures *prometheus.CounterVec - GroupInterval *prometheus.GaugeVec - GroupLastEvalTime *prometheus.GaugeVec - GroupLastDuration *prometheus.GaugeVec - GroupLastRuleDurationSum *prometheus.GaugeVec - GroupLastRestoreDuration *prometheus.GaugeVec - GroupRules *prometheus.GaugeVec - GroupSamples *prometheus.GaugeVec + EvalDuration prometheus.Summary + IterationDuration prometheus.Summary + IterationsMissed *prometheus.CounterVec + IterationsScheduled *prometheus.CounterVec + EvalTotal *prometheus.CounterVec + EvalFailures *prometheus.CounterVec + GroupInterval *prometheus.GaugeVec + GroupLastEvalTime *prometheus.GaugeVec + GroupLastDuration *prometheus.GaugeVec + GroupLastRuleDurationSum *prometheus.GaugeVec + GroupLastRestoreDuration *prometheus.GaugeVec + GroupRules *prometheus.GaugeVec + GroupSamples *prometheus.GaugeVec } // NewGroupMetrics creates a new instance of Metrics and registers it with the provided registerer, @@ -1041,15 +1041,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics { Name: "rule_evaluation_failures_total", Help: "The total number of rule evaluation failures.", }, - []string{"rule_group"}, - ), - EvalOperatorControllableFailures: prometheus.NewCounterVec( - prometheus.CounterOpts{ - Namespace: namespace, - Name: "rule_evaluation_operator_controllable_failures_total", - Help: "The total number of rule evaluation failures that are operator-controllable.", - }, - []string{"rule_group"}, + []string{"rule_group", "cause"}, ), GroupInterval: prometheus.NewGaugeVec( prometheus.GaugeOpts{ @@ -1117,7 +1109,6 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics { m.IterationsScheduled, m.EvalTotal, m.EvalFailures, - m.EvalOperatorControllableFailures, m.GroupInterval, m.GroupLastEvalTime, m.GroupLastDuration, diff --git a/rules/group_test.go b/rules/group_test.go index 0ef0a6ede8..fb7c4c8c84 100644 --- a/rules/group_test.go +++ b/rules/group_test.go @@ -316,14 +316,15 @@ func TestEvalOperatorControllableFailures(t *testing.T) { group.Eval(context.Background(), time.Now()) groupKey := GroupKey("test.yml", "test_group") - evalFailures := testutil.ToFloat64(group.metrics.EvalFailures.WithLabelValues(groupKey)) - evalOperatorControllableFailures := testutil.ToFloat64(group.metrics.EvalOperatorControllableFailures.WithLabelValues(groupKey)) + evalUserFailures := testutil.ToFloat64(group.metrics.EvalFailures.WithLabelValues(groupKey, "user")) + evalOperatorFailures := testutil.ToFloat64(group.metrics.EvalFailures.WithLabelValues(groupKey, "operator")) - require.Equal(t, float64(1), evalFailures) if tc.expectOperatorControllable { - require.Equal(t, float64(1), evalOperatorControllableFailures) + require.Equal(t, float64(0), evalUserFailures) + require.Equal(t, float64(1), evalOperatorFailures) } else { - require.Equal(t, float64(0), evalOperatorControllableFailures) + require.Equal(t, float64(1), evalUserFailures) + require.Equal(t, float64(0), evalOperatorFailures) } }) } @@ -397,8 +398,9 @@ func TestEvalDiscardedSamplesDoNotIncrementFailureMetrics(t *testing.T) { group.Eval(context.Background(), time.Now()) - require.Equal(t, float64(0), testutil.ToFloat64(group.metrics.EvalFailures.WithLabelValues(GroupKey("test.yml", "test_group")))) - require.Equal(t, float64(0), 
testutil.ToFloat64(group.metrics.EvalOperatorControllableFailures.WithLabelValues(GroupKey("test.yml", "test_group")))) + groupKey := GroupKey("test.yml", "test_group") + require.Equal(t, float64(0), testutil.ToFloat64(group.metrics.EvalFailures.WithLabelValues(groupKey, "user"))) + require.Equal(t, float64(0), testutil.ToFloat64(group.metrics.EvalFailures.WithLabelValues(groupKey, "operator"))) }) } } diff --git a/rules/manager.go b/rules/manager.go index 49672a6db7..3a42a7e30a 100644 --- a/rules/manager.go +++ b/rules/manager.go @@ -295,7 +295,8 @@ func (m *Manager) Update(interval time.Duration, files []string, externalLabels m.IterationsMissed.DeleteLabelValues(n) m.IterationsScheduled.DeleteLabelValues(n) m.EvalTotal.DeleteLabelValues(n) - m.EvalFailures.DeleteLabelValues(n) + m.EvalFailures.DeleteLabelValues(n, "user") + m.EvalFailures.DeleteLabelValues(n, "operator") m.GroupInterval.DeleteLabelValues(n) m.GroupLastEvalTime.DeleteLabelValues(n) m.GroupLastDuration.DeleteLabelValues(n) diff --git a/rules/manager_test.go b/rules/manager_test.go index 9b0a378212..e5d8a080dd 100644 --- a/rules/manager_test.go +++ b/rules/manager_test.go @@ -1196,11 +1196,11 @@ func TestMetricsUpdate(t *testing.T) { }{ { files: files, - metrics: 12, + metrics: 14, }, { files: files[:1], - metrics: 6, + metrics: 7, }, { files: files[:0], @@ -1208,7 +1208,7 @@ func TestMetricsUpdate(t *testing.T) { }, { files: files[1:], - metrics: 6, + metrics: 7, }, } From 1738ff17d2425efe8903b6b0b7d4f3633ea74378 Mon Sep 17 00:00:00 2001 From: Nikos Angelopoulos Date: Thu, 9 Oct 2025 18:33:49 +0200 Subject: [PATCH 6/6] change label name to reason --- rules/group.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rules/group.go b/rules/group.go index 5e373dc6bb..2d4a713ccc 100644 --- a/rules/group.go +++ b/rules/group.go @@ -725,11 +725,11 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) { } func (g *Group) incrementEvalFailures(err error) { - cause := "user" + reason := "user" if g.operatorControllableErrorClassifier != nil && g.operatorControllableErrorClassifier.IsOperatorControllable(err) { - cause = "operator" + reason = "operator" } - g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name()), cause).Inc() + g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name()), reason).Inc() } func (g *Group) QueryOffset() time.Duration { @@ -1041,7 +1041,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics { Name: "rule_evaluation_failures_total", Help: "The total number of rule evaluation failures.", }, - []string{"rule_group", "cause"}, + []string{"rule_group", "reason"}, ), GroupInterval: prometheus.NewGaugeVec( prometheus.GaugeOpts{