diff --git a/go.mod b/go.mod index b1bd2086de0..2c2a1edf2b7 100644 --- a/go.mod +++ b/go.mod @@ -349,7 +349,7 @@ require ( sigs.k8s.io/yaml v1.6.0 // indirect ) -replace github.com/prometheus/prometheus => github.com/grafana/mimir-prometheus v1.8.2-0.20251006025148-b189fcb22335 +replace github.com/prometheus/prometheus => github.com/grafana/mimir-prometheus v1.8.2-0.20251010073941-0ed5c290059e // Replace memberlist with our fork which includes some fixes that haven't been // merged upstream yet: diff --git a/go.sum b/go.sum index 34316cc5820..101b62681d6 100644 --- a/go.sum +++ b/go.sum @@ -573,8 +573,8 @@ github.com/grafana/memberlist v0.3.1-0.20250428154222-f7d51a6f6700 h1:0t7iOQ5ZkB github.com/grafana/memberlist v0.3.1-0.20250428154222-f7d51a6f6700/go.mod h1:Ri9p/tRShbjYnpNf4FFPXG7wxEGY4Nrcn6E7jrVa//4= github.com/grafana/mimir-otlptranslator v0.0.0-20250804202727-9e7a25d056aa h1:MUh6FkgCcJZCXDtEzuuEKTNCc1D0VQbRbqo2A9dr/hI= github.com/grafana/mimir-otlptranslator v0.0.0-20250804202727-9e7a25d056aa/go.mod h1:P8AwMgdD7XEr6QRUJ2QWLpiAZTgTE2UYgjlu3svompI= -github.com/grafana/mimir-prometheus v1.8.2-0.20251006025148-b189fcb22335 h1:J2gpG8i0hOWfQn083ysysHOTNkFd8tTWvtn17zQZFJI= -github.com/grafana/mimir-prometheus v1.8.2-0.20251006025148-b189fcb22335/go.mod h1:dp4jsJPZvZIHdvAC9lgpOI5uwqgNzBzOwo/ZU3zUp/c= +github.com/grafana/mimir-prometheus v1.8.2-0.20251010073941-0ed5c290059e h1:D5fyb5Dl9wakxU7REw88pVWD/Tqx3IknZC3TtWNMGcQ= +github.com/grafana/mimir-prometheus v1.8.2-0.20251010073941-0ed5c290059e/go.mod h1:dp4jsJPZvZIHdvAC9lgpOI5uwqgNzBzOwo/ZU3zUp/c= github.com/grafana/opentracing-contrib-go-stdlib v0.0.0-20230509071955-f410e79da956 h1:em1oddjXL8c1tL0iFdtVtPloq2hRPen2MJQKoAWpxu0= github.com/grafana/opentracing-contrib-go-stdlib v0.0.0-20230509071955-f410e79da956/go.mod h1:qtI1ogk+2JhVPIXVc6q+NHziSmy2W5GbdQZFUHADCBU= github.com/grafana/otel-profiling-go v0.5.1 h1:stVPKAFZSa7eGiqbYuG25VcqYksR6iWvF3YH66t4qL8= diff --git a/vendor/github.com/prometheus/prometheus/rules/group.go b/vendor/github.com/prometheus/prometheus/rules/group.go index 01ec2ecec9b..2d4a713ccc8 100644 --- a/vendor/github.com/prometheus/prometheus/rules/group.go +++ b/vendor/github.com/prometheus/prometheus/rules/group.go @@ -74,6 +74,8 @@ type Group struct { // defaults to DefaultEvalIterationFunc. evalIterationFunc GroupEvalIterationFunc + operatorControllableErrorClassifier OperatorControllableErrorClassifier + appOpts *storage.AppendOptions alignEvaluationTimeOnInterval bool } @@ -85,18 +87,32 @@ type Group struct { // DefaultEvalIterationFunc is the default implementation. type GroupEvalIterationFunc func(ctx context.Context, g *Group, evalTimestamp time.Time) +// OperatorControllableErrorClassifier classifies whether rule evaluation errors are operator-controllable. +type OperatorControllableErrorClassifier interface { + IsOperatorControllable(error) bool +} + type GroupOptions struct { - Name, File string - Interval time.Duration - Limit int - Rules []Rule - SourceTenants []string - ShouldRestore bool - Opts *ManagerOptions - QueryOffset *time.Duration - done chan struct{} - EvalIterationFunc GroupEvalIterationFunc - AlignEvaluationTimeOnInterval bool + Name, File string + Interval time.Duration + Limit int + Rules []Rule + SourceTenants []string + ShouldRestore bool + Opts *ManagerOptions + QueryOffset *time.Duration + done chan struct{} + EvalIterationFunc GroupEvalIterationFunc + AlignEvaluationTimeOnInterval bool + OperatorControllableErrorClassifier OperatorControllableErrorClassifier +} + +// DefaultOperatorControllableErrorClassifier is the default implementation of +// OperatorControllableErrorClassifier that classifies no errors as operator-controllable. +type DefaultOperatorControllableErrorClassifier struct{} + +func (*DefaultOperatorControllableErrorClassifier) IsOperatorControllable(_ error) bool { + return false } // NewGroup makes a new Group with the given name, options, and rules. @@ -114,7 +130,8 @@ func NewGroup(o GroupOptions) *Group { metrics.IterationsMissed.WithLabelValues(key) metrics.IterationsScheduled.WithLabelValues(key) metrics.EvalTotal.WithLabelValues(key) - metrics.EvalFailures.WithLabelValues(key) + metrics.EvalFailures.WithLabelValues(key, "user") + metrics.EvalFailures.WithLabelValues(key, "operator") metrics.GroupLastEvalTime.WithLabelValues(key) metrics.GroupLastDuration.WithLabelValues(key) metrics.GroupLastRuleDurationSum.WithLabelValues(key) @@ -127,29 +144,35 @@ func NewGroup(o GroupOptions) *Group { evalIterationFunc = DefaultEvalIterationFunc } + operatorControllableErrorClassifier := o.OperatorControllableErrorClassifier + if operatorControllableErrorClassifier == nil { + operatorControllableErrorClassifier = &DefaultOperatorControllableErrorClassifier{} + } + if opts.Logger == nil { opts.Logger = promslog.NewNopLogger() } return &Group{ - name: o.Name, - file: o.File, - interval: o.Interval, - queryOffset: o.QueryOffset, - limit: o.Limit, - rules: o.Rules, - shouldRestore: o.ShouldRestore, - opts: opts, - sourceTenants: o.SourceTenants, - seriesInPreviousEval: make([]map[string]labels.Labels, len(o.Rules)), - done: make(chan struct{}), - managerDone: o.done, - terminated: make(chan struct{}), - logger: opts.Logger.With("file", o.File, "group", o.Name), - metrics: metrics, - evalIterationFunc: evalIterationFunc, - appOpts: &storage.AppendOptions{DiscardOutOfOrder: true}, - alignEvaluationTimeOnInterval: o.AlignEvaluationTimeOnInterval, + name: o.Name, + file: o.File, + interval: o.Interval, + queryOffset: o.QueryOffset, + limit: o.Limit, + rules: o.Rules, + shouldRestore: o.ShouldRestore, + opts: opts, + sourceTenants: o.SourceTenants, + seriesInPreviousEval: make([]map[string]labels.Labels, len(o.Rules)), + done: make(chan struct{}), + managerDone: o.done, + terminated: make(chan struct{}), + logger: opts.Logger.With("file", o.File, "group", o.Name), + metrics: metrics, + evalIterationFunc: evalIterationFunc, + appOpts: &storage.AppendOptions{DiscardOutOfOrder: true}, + alignEvaluationTimeOnInterval: o.AlignEvaluationTimeOnInterval, + operatorControllableErrorClassifier: operatorControllableErrorClassifier, } } @@ -546,7 +569,7 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) { rule.SetHealth(HealthBad) rule.SetLastError(err) sp.SetStatus(codes.Error, err.Error()) - g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc() + g.incrementEvalFailures(err) // Canceled queries are intentional termination of queries. This normally // happens on shutdown and thus we skip logging of any errors here. @@ -576,7 +599,7 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) { rule.SetHealth(HealthBad) rule.SetLastError(err) sp.SetStatus(codes.Error, err.Error()) - g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc() + g.incrementEvalFailures(err) logger.Warn("Rule sample appending failed", "err", err) return @@ -701,6 +724,14 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) { g.cleanupStaleSeries(ctx, ts) } +func (g *Group) incrementEvalFailures(err error) { + reason := "user" + if g.operatorControllableErrorClassifier != nil && g.operatorControllableErrorClassifier.IsOperatorControllable(err) { + reason = "operator" + } + g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name()), reason).Inc() +} + func (g *Group) QueryOffset() time.Duration { if g.queryOffset != nil { return *g.queryOffset @@ -1010,7 +1041,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics { Name: "rule_evaluation_failures_total", Help: "The total number of rule evaluation failures.", }, - []string{"rule_group"}, + []string{"rule_group", "reason"}, ), GroupInterval: prometheus.NewGaugeVec( prometheus.GaugeOpts{ diff --git a/vendor/github.com/prometheus/prometheus/rules/manager.go b/vendor/github.com/prometheus/prometheus/rules/manager.go index 49672a6db7e..3a42a7e30a2 100644 --- a/vendor/github.com/prometheus/prometheus/rules/manager.go +++ b/vendor/github.com/prometheus/prometheus/rules/manager.go @@ -295,7 +295,8 @@ func (m *Manager) Update(interval time.Duration, files []string, externalLabels m.IterationsMissed.DeleteLabelValues(n) m.IterationsScheduled.DeleteLabelValues(n) m.EvalTotal.DeleteLabelValues(n) - m.EvalFailures.DeleteLabelValues(n) + m.EvalFailures.DeleteLabelValues(n, "user") + m.EvalFailures.DeleteLabelValues(n, "operator") m.GroupInterval.DeleteLabelValues(n) m.GroupLastEvalTime.DeleteLabelValues(n) m.GroupLastDuration.DeleteLabelValues(n) diff --git a/vendor/modules.txt b/vendor/modules.txt index 9b7af621c3d..0b3f7e81274 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -1243,7 +1243,7 @@ github.com/prometheus/otlptranslator github.com/prometheus/procfs github.com/prometheus/procfs/internal/fs github.com/prometheus/procfs/internal/util -# github.com/prometheus/prometheus v1.99.0 => github.com/grafana/mimir-prometheus v1.8.2-0.20251006025148-b189fcb22335 +# github.com/prometheus/prometheus v1.99.0 => github.com/grafana/mimir-prometheus v1.8.2-0.20251010073941-0ed5c290059e ## explicit; go 1.24.0 github.com/prometheus/prometheus/config github.com/prometheus/prometheus/discovery @@ -2191,7 +2191,7 @@ sigs.k8s.io/kustomize/kyaml/yaml/walk sigs.k8s.io/yaml sigs.k8s.io/yaml/goyaml.v2 sigs.k8s.io/yaml/goyaml.v3 -# github.com/prometheus/prometheus => github.com/grafana/mimir-prometheus v1.8.2-0.20251006025148-b189fcb22335 +# github.com/prometheus/prometheus => github.com/grafana/mimir-prometheus v1.8.2-0.20251010073941-0ed5c290059e # github.com/hashicorp/memberlist => github.com/grafana/memberlist v0.3.1-0.20250428154222-f7d51a6f6700 # gopkg.in/yaml.v3 => github.com/colega/go-yaml-yaml v0.0.0-20220720105220-255a8d16d094 # github.com/grafana/regexp => github.com/grafana/regexp v0.0.0-20250905101755-5eb4f3acbf71