Skip to content

Commit f2fbb31

Browse files
authored
feat(inhibit): add inhibition metrics (#4629)
Add metrics for inhibitor:
- alertmanager_inhibitor_source_alerts_cache_items
- alertmanager_inhibitor_source_alerts_index_items
- alertmanager_inhibitor_mutes_duration_seconds

Add metrics for inhibition rules:
- alertmanager_inhibit_rule_source_alerts_cache_items
- alertmanager_inhibit_rule_source_alerts_index_items
- alertmanager_inhibit_rule_matches_duration_seconds
- alertmanager_inhibit_rule_mutes_duration_seconds

Other changes:
- Add debug logs for duplicate inhibition rule names
- Add Len() method to store.Alerts struct
- Add Len() method to inhibit.index struct
- Update docs

Signed-off-by: Siavash Safi <[email protected]>
1 parent 1f2df03 commit f2fbb31

File tree

10 files changed

+713
-24
lines changed

10 files changed

+713
-24
lines changed

cmd/alertmanager/main.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,9 @@ var (
104104
prometheus.GaugeOpts{
105105
Name: "alertmanager_inhibition_rules",
106106
Help: "Number of configured inhibition rules.",
107-
})
107+
},
108+
)
109+
108110
promslogConfig = promslog.Config{}
109111
)
110112

@@ -408,6 +410,7 @@ func run() int {
408410
)
409411

410412
dispMetrics := dispatch.NewDispatcherMetrics(false, prometheus.DefaultRegisterer)
413+
inhibitMetrics := inhibit.NewInhibitorMetrics(prometheus.DefaultRegisterer)
411414
pipelineBuilder := notify.NewPipelineBuilder(prometheus.DefaultRegisterer, ff)
412415
configLogger := logger.With("component", "configuration")
413416
configCoordinator := config.NewCoordinator(
@@ -462,7 +465,7 @@ func run() int {
462465
inhibitor.Stop()
463466
disp.Stop()
464467

465-
inhibitor = inhibit.NewInhibitor(alerts, conf.InhibitRules, marker, logger)
468+
inhibitor = inhibit.NewInhibitor(alerts, conf.InhibitRules, marker, logger, inhibitMetrics)
466469
silencer := silence.NewSilencer(silences, marker, logger)
467470

468471
// An interface value that holds a nil concrete value is non-nil.

docs/configuration.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -442,6 +442,7 @@ to reason about and does not trigger this special case.
442442

443443
```yaml
444444
# Optional name of the inhibition rule.
445+
# Duplicate names are allowed but will affect the per-rule metrics.
445446
name: <string>
446447
447448
# DEPRECATED: Use target_matchers below.

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ require (
3535
github.com/oklog/run v1.2.0
3636
github.com/oklog/ulid v1.3.1
3737
github.com/prometheus/client_golang v1.23.2
38+
github.com/prometheus/client_model v0.6.2
3839
github.com/prometheus/common v0.67.1
3940
github.com/prometheus/exporter-toolkit v0.14.1
4041
github.com/prometheus/sigv4 v0.2.1
@@ -104,7 +105,6 @@ require (
104105
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect
105106
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 // indirect
106107
github.com/pmezard/go-difflib v1.0.0 // indirect
107-
github.com/prometheus/client_model v0.6.2 // indirect
108108
github.com/prometheus/procfs v0.16.1 // indirect
109109
github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529 // indirect
110110
github.com/xhit/go-str2duration/v2 v2.1.0 // indirect

inhibit/index.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,3 +55,10 @@ func (c *index) Delete(key model.Fingerprint) {
5555

5656
delete(c.items, key)
5757
}
58+
59+
func (c *index) Len() int {
60+
c.mtx.RLock()
61+
defer c.mtx.RUnlock()
62+
63+
return len(c.items)
64+
}

inhibit/inhibit.go

Lines changed: 56 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"time"
2121

2222
"github.com/oklog/run"
23+
"github.com/prometheus/client_golang/prometheus"
2324
"github.com/prometheus/common/model"
2425

2526
"github.com/prometheus/alertmanager/config"
@@ -33,26 +34,39 @@ import (
3334
// currently active alerts and a set of inhibition rules. It implements the
3435
// Muter interface.
3536
type Inhibitor struct {
36-
alerts provider.Alerts
37-
rules []*InhibitRule
38-
marker types.AlertMarker
39-
logger *slog.Logger
37+
alerts provider.Alerts
38+
rules []*InhibitRule
39+
marker types.AlertMarker
40+
logger *slog.Logger
41+
metrics *InhibitorMetrics
4042

4143
mtx sync.RWMutex
4244
cancel func()
4345
}
4446

4547
// NewInhibitor returns a new Inhibitor.
46-
func NewInhibitor(ap provider.Alerts, rs []config.InhibitRule, mk types.AlertMarker, logger *slog.Logger) *Inhibitor {
48+
func NewInhibitor(ap provider.Alerts, rs []config.InhibitRule, mk types.AlertMarker, logger *slog.Logger, metrics *InhibitorMetrics) *Inhibitor {
4749
ih := &Inhibitor{
48-
alerts: ap,
49-
marker: mk,
50-
logger: logger,
50+
alerts: ap,
51+
marker: mk,
52+
logger: logger,
53+
metrics: metrics,
5154
}
52-
for _, cr := range rs {
53-
r := NewInhibitRule(cr)
55+
56+
ruleNames := make(map[string]struct{})
57+
for i, cr := range rs {
58+
if _, ok := ruleNames[cr.Name]; ok {
59+
ih.logger.Debug("duplicate inhibition rule name", "index", i, "name", cr.Name)
60+
}
61+
62+
r := NewInhibitRule(cr, NewRuleMetrics(cr.Name, metrics))
5463
ih.rules = append(ih.rules, r)
64+
65+
if cr.Name != "" {
66+
ruleNames[cr.Name] = struct{}{}
67+
}
5568
}
69+
5670
return ih
5771
}
5872

@@ -70,16 +84,30 @@ func (ih *Inhibitor) run(ctx context.Context) {
7084
continue
7185
}
7286
// Update the inhibition rules' cache.
87+
cachedSum := 0
88+
indexedSum := 0
7389
for _, r := range ih.rules {
7490
if r.SourceMatchers.Matches(a.Labels) {
7591
if err := r.scache.Set(a); err != nil {
7692
ih.logger.Error("error on set alert", "err", err)
7793
continue
7894
}
79-
8095
r.updateIndex(a)
96+
97+
cached := r.scache.Len()
98+
indexed := r.sindex.Len()
99+
100+
if r.Name != "" {
101+
r.metrics.sourceAlertsCacheItems.With(prometheus.Labels{"rule": r.Name}).Set(float64(cached))
102+
r.metrics.sourceAlertsIndexItems.With(prometheus.Labels{"rule": r.Name}).Set(float64(indexed))
103+
}
104+
105+
cachedSum += cached
106+
indexedSum += indexed
81107
}
82108
}
109+
ih.metrics.sourceAlertsCacheItems.Set(float64(cachedSum))
110+
ih.metrics.sourceAlertsIndexItems.Set(float64(indexedSum))
83111
}
84112
}
85113
}
@@ -128,21 +156,29 @@ func (ih *Inhibitor) Stop() {
128156
// Mutes returns true iff the given label set is muted. It implements the Muter
129157
// interface.
130158
func (ih *Inhibitor) Mutes(lset model.LabelSet) bool {
159+
start := time.Now()
131160
fp := lset.Fingerprint()
132161

133162
for _, r := range ih.rules {
163+
ruleStart := time.Now()
134164
if !r.TargetMatchers.Matches(lset) {
135165
// If target side of rule doesn't match, we don't need to look any further.
166+
r.metrics.matchesDuration.With(prometheus.Labels{"rule": r.Name, "matched": "false"}).Observe(time.Since(ruleStart).Seconds())
136167
continue
137168
}
169+
r.metrics.matchesDuration.With(prometheus.Labels{"rule": r.Name, "matched": "true"}).Observe(time.Since(ruleStart).Seconds())
138170
// If we are here, the target side matches. If the source side matches, too, we
139171
// need to exclude inhibiting alerts for which the same is true.
140172
if inhibitedByFP, eq := r.hasEqual(lset, r.SourceMatchers.Matches(lset)); eq {
141173
ih.marker.SetInhibited(fp, inhibitedByFP.String())
174+
ih.metrics.mutesDuration.With(prometheus.Labels{"muted": "true"}).Observe(time.Since(start).Seconds())
175+
r.metrics.mutesDuration.With(prometheus.Labels{"rule": r.Name, "muted": "true"}).Observe(time.Since(ruleStart).Seconds())
142176
return true
143177
}
178+
r.metrics.mutesDuration.With(prometheus.Labels{"rule": r.Name, "muted": "false"}).Observe(time.Since(ruleStart).Seconds())
144179
}
145180
ih.marker.SetInhibited(fp)
181+
ih.metrics.mutesDuration.With(prometheus.Labels{"muted": "false"}).Observe(time.Since(start).Seconds())
146182

147183
return false
148184
}
@@ -173,14 +209,17 @@ type InhibitRule struct {
173209
// The index items might overwrite each other if multiple source alerts have exactly equal labels.
174210
// Overwrites only happen if the new source alert has bigger EndsAt value.
175211
sindex *index
212+
213+
metrics *RuleMetrics
176214
}
177215

178216
// NewInhibitRule returns a new InhibitRule based on a configuration definition.
179-
func NewInhibitRule(cr config.InhibitRule) *InhibitRule {
217+
func NewInhibitRule(cr config.InhibitRule, metrics *RuleMetrics) *InhibitRule {
180218
var (
181219
sourcem labels.Matchers
182220
targetm labels.Matchers
183221
)
222+
184223
// cr.SourceMatch will be deprecated. This for loop appends regex matchers.
185224
for ln, lv := range cr.SourceMatch {
186225
matcher, err := labels.NewMatcher(labels.MatchEqual, ln, lv)
@@ -235,6 +274,7 @@ func NewInhibitRule(cr config.InhibitRule) *InhibitRule {
235274
Equal: equal,
236275
scache: store.NewAlerts(),
237276
sindex: newIndex(),
277+
metrics: metrics,
238278
}
239279

240280
rule.scache.SetGCCallback(rule.gcCallback)
@@ -310,6 +350,10 @@ func (r *InhibitRule) gcCallback(alerts []types.Alert) {
310350
fp := r.fingerprintEquals(a.Labels)
311351
r.sindex.Delete(fp)
312352
}
353+
if r.Name != "" {
354+
r.metrics.sourceAlertsCacheItems.With(prometheus.Labels{"rule": r.Name}).Set(float64(r.scache.Len()))
355+
r.metrics.sourceAlertsIndexItems.With(prometheus.Labels{"rule": r.Name}).Set(float64(r.sindex.Len()))
356+
}
313357
}
314358

315359
// hasEqual checks whether the source cache contains alerts matching the equal

inhibit/inhibit_bench_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ func benchmarkMutes(b *testing.B, opts benchmarkOptions) {
198198
}
199199
}
200200

201-
ih := NewInhibitor(s, rules, m, promslog.NewNopLogger())
201+
ih := NewInhibitor(s, rules, m, promslog.NewNopLogger(), NewInhibitorMetrics(r))
202202
defer ih.Stop()
203203
go ih.Run()
204204

inhibit/inhibit_test.go

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -125,9 +125,10 @@ func TestInhibitRuleHasEqual(t *testing.T) {
125125
for _, c := range cases {
126126
t.Run(c.name, func(t *testing.T) {
127127
r := &InhibitRule{
128-
Equal: map[model.LabelName]struct{}{},
129-
scache: store.NewAlerts(),
130-
sindex: newIndex(),
128+
Equal: map[model.LabelName]struct{}{},
129+
scache: store.NewAlerts(),
130+
sindex: newIndex(),
131+
metrics: NewRuleMetrics("test", NewInhibitorMetrics(prometheus.NewRegistry())),
131132
}
132133
for _, ln := range c.equal {
133134
r.Equal[ln] = struct{}{}
@@ -159,7 +160,7 @@ func TestInhibitRuleMatches(t *testing.T) {
159160
}
160161

161162
m := types.NewMarker(prometheus.NewRegistry())
162-
ih := NewInhibitor(nil, []config.InhibitRule{rule1, rule2}, m, nopLogger)
163+
ih := NewInhibitor(nil, []config.InhibitRule{rule1, rule2}, m, nopLogger, NewInhibitorMetrics(prometheus.NewRegistry()))
163164
now := time.Now()
164165
// Active alert that matches the source filter of rule1.
165166
sourceAlert1 := &types.Alert{
@@ -260,7 +261,7 @@ func TestInhibitRuleMatchers(t *testing.T) {
260261
}
261262

262263
m := types.NewMarker(prometheus.NewRegistry())
263-
ih := NewInhibitor(nil, []config.InhibitRule{rule1, rule2}, m, nopLogger)
264+
ih := NewInhibitor(nil, []config.InhibitRule{rule1, rule2}, m, nopLogger, NewInhibitorMetrics(prometheus.NewRegistry()))
264265
now := time.Now()
265266
// Active alert that matches the source filter of rule1.
266267
sourceAlert1 := &types.Alert{
@@ -369,8 +370,8 @@ func TestInhibitRuleName(t *testing.T) {
369370
Equal: []string{"instance"},
370371
}
371372

372-
rule1 := NewInhibitRule(config1)
373-
rule2 := NewInhibitRule(config2)
373+
rule1 := NewInhibitRule(config1, nil)
374+
rule2 := NewInhibitRule(config2, nil)
374375

375376
require.Equal(t, "test-rule", rule1.Name, "Expected named rule to have adopt name from config")
376377
require.Empty(t, rule2.Name, "Expected unnamed rule to have empty name")
@@ -498,7 +499,7 @@ func TestInhibit(t *testing.T) {
498499
} {
499500
ap := newFakeAlerts(tc.alerts)
500501
mk := types.NewMarker(prometheus.NewRegistry())
501-
inhibitor := NewInhibitor(ap, []config.InhibitRule{inhibitRule()}, mk, nopLogger)
502+
inhibitor := NewInhibitor(ap, []config.InhibitRule{inhibitRule()}, mk, nopLogger, NewInhibitorMetrics(prometheus.NewRegistry()))
502503

503504
go func() {
504505
for ap.finished != nil {

0 commit comments

Comments
 (0)