diff --git a/rules/group.go b/rules/group.go
index 01ec2ecec..2d4a713cc 100644
--- a/rules/group.go
+++ b/rules/group.go
@@ -74,6 +74,8 @@ type Group struct {
 	// defaults to DefaultEvalIterationFunc.
 	evalIterationFunc GroupEvalIterationFunc
 
+	operatorControllableErrorClassifier OperatorControllableErrorClassifier
+
 	appOpts                       *storage.AppendOptions
 	alignEvaluationTimeOnInterval bool
 }
@@ -85,18 +87,32 @@ type Group struct {
 // DefaultEvalIterationFunc is the default implementation.
 type GroupEvalIterationFunc func(ctx context.Context, g *Group, evalTimestamp time.Time)
 
+// OperatorControllableErrorClassifier classifies whether rule evaluation errors are operator-controllable.
+type OperatorControllableErrorClassifier interface {
+	IsOperatorControllable(error) bool
+}
+
 type GroupOptions struct {
-	Name, File                    string
-	Interval                      time.Duration
-	Limit                         int
-	Rules                         []Rule
-	SourceTenants                 []string
-	ShouldRestore                 bool
-	Opts                          *ManagerOptions
-	QueryOffset                   *time.Duration
-	done                          chan struct{}
-	EvalIterationFunc             GroupEvalIterationFunc
-	AlignEvaluationTimeOnInterval bool
+	Name, File                          string
+	Interval                            time.Duration
+	Limit                               int
+	Rules                               []Rule
+	SourceTenants                       []string
+	ShouldRestore                       bool
+	Opts                                *ManagerOptions
+	QueryOffset                         *time.Duration
+	done                                chan struct{}
+	EvalIterationFunc                   GroupEvalIterationFunc
+	AlignEvaluationTimeOnInterval       bool
+	OperatorControllableErrorClassifier OperatorControllableErrorClassifier
+}
+
+// DefaultOperatorControllableErrorClassifier is the default implementation of
+// OperatorControllableErrorClassifier that classifies no errors as operator-controllable.
+type DefaultOperatorControllableErrorClassifier struct{}
+
+func (*DefaultOperatorControllableErrorClassifier) IsOperatorControllable(_ error) bool {
+	return false
 }
 
 // NewGroup makes a new Group with the given name, options, and rules.
@@ -114,7 +130,8 @@ func NewGroup(o GroupOptions) *Group {
 	metrics.IterationsMissed.WithLabelValues(key)
 	metrics.IterationsScheduled.WithLabelValues(key)
 	metrics.EvalTotal.WithLabelValues(key)
-	metrics.EvalFailures.WithLabelValues(key)
+	metrics.EvalFailures.WithLabelValues(key, "user")
+	metrics.EvalFailures.WithLabelValues(key, "operator")
 	metrics.GroupLastEvalTime.WithLabelValues(key)
 	metrics.GroupLastDuration.WithLabelValues(key)
 	metrics.GroupLastRuleDurationSum.WithLabelValues(key)
@@ -127,29 +144,35 @@ func NewGroup(o GroupOptions) *Group {
 		evalIterationFunc = DefaultEvalIterationFunc
 	}
 
+	operatorControllableErrorClassifier := o.OperatorControllableErrorClassifier
+	if operatorControllableErrorClassifier == nil {
+		operatorControllableErrorClassifier = &DefaultOperatorControllableErrorClassifier{}
+	}
+
 	if opts.Logger == nil {
 		opts.Logger = promslog.NewNopLogger()
 	}
 
 	return &Group{
-		name:                          o.Name,
-		file:                          o.File,
-		interval:                      o.Interval,
-		queryOffset:                   o.QueryOffset,
-		limit:                         o.Limit,
-		rules:                         o.Rules,
-		shouldRestore:                 o.ShouldRestore,
-		opts:                          opts,
-		sourceTenants:                 o.SourceTenants,
-		seriesInPreviousEval:          make([]map[string]labels.Labels, len(o.Rules)),
-		done:                          make(chan struct{}),
-		managerDone:                   o.done,
-		terminated:                    make(chan struct{}),
-		logger:                        opts.Logger.With("file", o.File, "group", o.Name),
-		metrics:                       metrics,
-		evalIterationFunc:             evalIterationFunc,
-		appOpts:                       &storage.AppendOptions{DiscardOutOfOrder: true},
-		alignEvaluationTimeOnInterval: o.AlignEvaluationTimeOnInterval,
+		name:                                o.Name,
+		file:                                o.File,
+		interval:                            o.Interval,
+		queryOffset:                         o.QueryOffset,
+		limit:                               o.Limit,
+		rules:                               o.Rules,
+		shouldRestore:                       o.ShouldRestore,
+		opts:                                opts,
+		sourceTenants:                       o.SourceTenants,
+		seriesInPreviousEval:                make([]map[string]labels.Labels, len(o.Rules)),
+		done:                                make(chan struct{}),
+		managerDone:                         o.done,
+		terminated:                          make(chan struct{}),
+		logger:                              opts.Logger.With("file", o.File, "group", o.Name),
+		metrics:                             metrics,
+		evalIterationFunc:                   evalIterationFunc,
+		appOpts:                             &storage.AppendOptions{DiscardOutOfOrder: true},
+		alignEvaluationTimeOnInterval:       o.AlignEvaluationTimeOnInterval,
+		operatorControllableErrorClassifier: operatorControllableErrorClassifier,
 	}
 }
 
@@ -546,7 +569,7 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) {
 			rule.SetHealth(HealthBad)
 			rule.SetLastError(err)
 			sp.SetStatus(codes.Error, err.Error())
-			g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc()
+			g.incrementEvalFailures(err)
 
 			// Canceled queries are intentional termination of queries. This normally
 			// happens on shutdown and thus we skip logging of any errors here.
@@ -576,7 +599,7 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) {
 			rule.SetHealth(HealthBad)
 			rule.SetLastError(err)
 			sp.SetStatus(codes.Error, err.Error())
-			g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc()
+			g.incrementEvalFailures(err)
 
 			logger.Warn("Rule sample appending failed", "err", err)
 			return
@@ -701,6 +724,14 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) {
 	g.cleanupStaleSeries(ctx, ts)
 }
 
+func (g *Group) incrementEvalFailures(err error) {
+	reason := "user"
+	if g.operatorControllableErrorClassifier != nil && g.operatorControllableErrorClassifier.IsOperatorControllable(err) {
+		reason = "operator"
+	}
+	g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name()), reason).Inc()
+}
+
 func (g *Group) QueryOffset() time.Duration {
 	if g.queryOffset != nil {
 		return *g.queryOffset
@@ -1010,7 +1041,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
 				Name:      "rule_evaluation_failures_total",
 				Help:      "The total number of rule evaluation failures.",
 			},
-			[]string{"rule_group"},
+			[]string{"rule_group", "reason"},
 		),
 		GroupInterval: prometheus.NewGaugeVec(
 			prometheus.GaugeOpts{
diff --git a/rules/group_test.go b/rules/group_test.go
index c5c812b16..fb7c4c8c8 100644
--- a/rules/group_test.go
+++ b/rules/group_test.go
@@ -14,13 +14,20 @@ package rules
 
 import (
+	"context"
+	"fmt"
+	"strings"
 	"testing"
 	"time"
 
+	"github.com/prometheus/client_golang/prometheus/testutil"
+	"github.com/prometheus/common/promslog"
 	"github.com/stretchr/testify/require"
 
 	"github.com/prometheus/prometheus/model/labels"
+	"github.com/prometheus/prometheus/promql"
 	"github.com/prometheus/prometheus/promql/parser"
+	"github.com/prometheus/prometheus/util/teststorage"
 )
 
 func TestGroup_Equals(t *testing.T) {
@@ -248,6 +255,156 @@ func TestGroup_Equals(t *testing.T) {
 	}
 }
 
+// HTTPStatusOperatorControllableErrorClassifier is a test classifier that identifies
+// 429 and 5xx status codes as operator-controllable errors.
+type HTTPStatusOperatorControllableErrorClassifier struct{}
+
+func (*HTTPStatusOperatorControllableErrorClassifier) IsOperatorControllable(err error) bool {
+	if err == nil {
+		return false
+	}
+	errMsg := err.Error()
+	return strings.Contains(errMsg, "429") || strings.Contains(errMsg, "50")
+}
+
+func TestEvalOperatorControllableFailures(t *testing.T) {
+	storage := teststorage.New(t)
+	t.Cleanup(func() { storage.Close() })
+
+	expr, err := parser.ParseExpr("up")
+	require.NoError(t, err)
+	rule := NewRecordingRule("test_rule", expr, labels.EmptyLabels())
+
+	customClassifier := &HTTPStatusOperatorControllableErrorClassifier{}
+
+	testCases := []struct {
+		name                       string
+		errorMessage               string
+		classifier                 OperatorControllableErrorClassifier
+		expectOperatorControllable bool
+	}{
+		{"default classifier", "any error", nil, false},
+		{"custom 429 classified as operator controllable", "HTTP 429 Too Many Requests", customClassifier, true},
+		{"custom 500 classified as operator controllable", "HTTP 500 Internal Server Error", customClassifier, true},
+		{"custom 502 classified as operator controllable", "HTTP 502 Bad Gateway", customClassifier, true},
+		{"custom 400 not operator controllable", "HTTP 400 Bad Request", customClassifier, false},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			errorQueryFunc := func(_ context.Context, _ string, _ time.Time) (promql.Vector, error) {
+				return nil, fmt.Errorf("%s", tc.errorMessage)
+			}
+
+			opts := &ManagerOptions{
+				Context:    context.Background(),
+				QueryFunc:  errorQueryFunc,
+				Appendable: storage,
+				Queryable:  storage,
+				Logger:     promslog.NewNopLogger(),
+			}
+
+			group := NewGroup(GroupOptions{
+				Name:                                "test_group",
+				File:                                "test.yml",
+				Interval:                            time.Second,
+				Rules:                               []Rule{rule},
+				Opts:                                opts,
+				OperatorControllableErrorClassifier: tc.classifier,
+			})
+
+			group.Eval(context.Background(), time.Now())
+
+			groupKey := GroupKey("test.yml", "test_group")
+			evalUserFailures := testutil.ToFloat64(group.metrics.EvalFailures.WithLabelValues(groupKey, "user"))
+			evalOperatorFailures := testutil.ToFloat64(group.metrics.EvalFailures.WithLabelValues(groupKey, "operator"))
+
+			if tc.expectOperatorControllable {
+				require.Equal(t, float64(0), evalUserFailures)
+				require.Equal(t, float64(1), evalOperatorFailures)
+			} else {
+				require.Equal(t, float64(1), evalUserFailures)
+				require.Equal(t, float64(0), evalOperatorFailures)
+			}
+		})
+	}
+}
+
+func TestEvalDiscardedSamplesDoNotIncrementFailureMetrics(t *testing.T) {
+	testCases := []struct {
+		name         string
+		setupStorage func(storage *teststorage.TestStorage)
+		offsetMs     int64 // milliseconds offset from evaluation time
+	}{
+		{
+			name: "out of order samples",
+			setupStorage: func(s *teststorage.TestStorage) {
+				app := s.Appender(context.Background())
+				app.Append(0, labels.FromStrings("__name__", "test_metric", "job", "test"), time.Now().UnixMilli(), 1.0)
+				app.Commit()
+			},
+			offsetMs: -10000, // 10 seconds in past
+		},
+		{
+			name:         "too old samples",
+			setupStorage: func(_ *teststorage.TestStorage) {},
+			offsetMs:     -86400000, // 24 hours in past
+		},
+		{
+			name: "duplicate samples",
+			setupStorage: func(s *teststorage.TestStorage) {
+				app := s.Appender(context.Background())
+				app.Append(0, labels.FromStrings("__name__", "test_metric", "job", "test"), time.Now().UnixMilli(), 1.0)
+				app.Commit()
+			},
+			offsetMs: 0, // Same timestamp, different value
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			storage := teststorage.New(t)
+			t.Cleanup(func() { storage.Close() })
+
+			tc.setupStorage(storage)
+
+			expr, err := parser.ParseExpr("up")
+			require.NoError(t, err)
+			rule := NewRecordingRule("test_rule", expr, labels.EmptyLabels())
+
+			queryFunc := func(_ context.Context, _ string, ts time.Time) (promql.Vector, error) {
+				return promql.Vector{
+					promql.Sample{
+						Metric: labels.FromStrings("__name__", "test_metric", "job", "test"),
+						T:      ts.UnixMilli() + tc.offsetMs,
+						F:      2.0, // Different value for duplicate case
+					},
+				}, nil
+			}
+
+			group := NewGroup(GroupOptions{
+				Name:     "test_group",
+				File:     "test.yml",
+				Interval: time.Second,
+				Rules:    []Rule{rule},
+				Opts: &ManagerOptions{
+					Context:    context.Background(),
+					QueryFunc:  queryFunc,
+					Appendable: storage,
+					Queryable:  storage,
+					Logger:     promslog.NewNopLogger(),
+				},
+			})
+
+			group.Eval(context.Background(), time.Now())
+
+			groupKey := GroupKey("test.yml", "test_group")
+			require.Equal(t, float64(0), testutil.ToFloat64(group.metrics.EvalFailures.WithLabelValues(groupKey, "user")))
+			require.Equal(t, float64(0), testutil.ToFloat64(group.metrics.EvalFailures.WithLabelValues(groupKey, "operator")))
+		})
+	}
+}
+
 func pointerOf[T any](value T) *T {
 	return &value
 }
diff --git a/rules/manager.go b/rules/manager.go
index 49672a6db..3a42a7e30 100644
--- a/rules/manager.go
+++ b/rules/manager.go
@@ -295,7 +295,8 @@ func (m *Manager) Update(interval time.Duration, files []string, externalLabels
 				m.IterationsMissed.DeleteLabelValues(n)
 				m.IterationsScheduled.DeleteLabelValues(n)
 				m.EvalTotal.DeleteLabelValues(n)
-				m.EvalFailures.DeleteLabelValues(n)
+				m.EvalFailures.DeleteLabelValues(n, "user")
+				m.EvalFailures.DeleteLabelValues(n, "operator")
 				m.GroupInterval.DeleteLabelValues(n)
 				m.GroupLastEvalTime.DeleteLabelValues(n)
 				m.GroupLastDuration.DeleteLabelValues(n)
diff --git a/rules/manager_test.go b/rules/manager_test.go
index 9b0a37821..e5d8a080d 100644
--- a/rules/manager_test.go
+++ b/rules/manager_test.go
@@ -1196,11 +1196,11 @@ func TestMetricsUpdate(t *testing.T) {
 	}{
 		{
 			files:   files,
-			metrics: 12,
+			metrics: 14,
 		},
 		{
 			files:   files[:1],
-			metrics: 6,
+			metrics: 7,
 		},
 		{
 			files:   files[:0],
@@ -1208,7 +1208,7 @@ func TestMetricsUpdate(t *testing.T) {
 		},
 		{
 			files:   files[1:],
-			metrics: 6,
+			metrics: 7,
 		},
 	}