Skip to content

Commit e3bc55d

Browse files
committed
feat(notify): make mute stage honor send_resolved
Make the mute stage honor `send_resolved` in receivers. This fixes cases where an alert had already been sent to a receiver before it was silenced. Sending a resolved notification for a now-silenced alert lets the receiver update its status; for example, an open PagerDuty incident will be closed. Signed-off-by: Siavash Safi <[email protected]>
1 parent f2fbb31 commit e3bc55d

File tree

2 files changed

+334
-2
lines changed

2 files changed

+334
-2
lines changed

notify/notify.go

Lines changed: 130 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -541,17 +541,37 @@ const (
541541

542542
// MuteStage filters alerts through a Muter.
543543
type MuteStage struct {
544-
muter types.Muter
545-
metrics *Metrics
544+
muter types.Muter
545+
metrics *Metrics
546+
notificationLog NotificationLog
547+
receivers map[string][]Integration
546548
}
547549

548550
// NewMuteStage return a new MuteStage.
549551
func NewMuteStage(m types.Muter, metrics *Metrics) *MuteStage {
550552
return &MuteStage{muter: m, metrics: metrics}
551553
}
552554

555+
// NewMuteStageWithSendResolved returns a new MuteStage that honors send_resolved
556+
// for silenced alerts. This should be used for silence.Silencer to allow resolved
557+
// notifications for previously notified alerts.
558+
func NewMuteStageWithSendResolved(m types.Muter, notificationLog NotificationLog, receivers map[string][]Integration, metrics *Metrics) *MuteStage {
559+
return &MuteStage{
560+
muter: m,
561+
metrics: metrics,
562+
notificationLog: notificationLog,
563+
receivers: receivers,
564+
}
565+
}
566+
553567
// Exec implements the Stage interface.
554568
func (n *MuteStage) Exec(ctx context.Context, logger *slog.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
569+
// If this is a silencer with send_resolved support, use the enhanced logic
570+
if _, isSilencer := n.muter.(*silence.Silencer); isSilencer && n.notificationLog != nil && n.receivers != nil {
571+
return n.execWithSendResolved(ctx, logger, alerts...)
572+
}
573+
574+
// Standard muting logic for inhibitors or silencers without send_resolved support
555575
var (
556576
filtered []*types.Alert
557577
muted []*types.Alert
@@ -583,6 +603,114 @@ func (n *MuteStage) Exec(ctx context.Context, logger *slog.Logger, alerts ...*ty
583603
return ctx, filtered, nil
584604
}
585605

606+
// execWithSendResolved implements enhanced silence filtering that honors send_resolved config.
607+
// It allows resolved notifications for silenced alerts if the alert was previously notified
608+
// and the receiver has send_resolved enabled.
609+
func (n *MuteStage) execWithSendResolved(ctx context.Context, logger *slog.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
610+
var (
611+
filtered []*types.Alert
612+
muted []*types.Alert
613+
)
614+
615+
// Get the receiver name from context
616+
receiverName, ok := ReceiverName(ctx)
617+
if !ok {
618+
return ctx, nil, errors.New("receiver name missing")
619+
}
620+
621+
// Get the group key from context
622+
groupKey, ok := GroupKey(ctx)
623+
if !ok {
624+
return ctx, nil, errors.New("group key missing")
625+
}
626+
627+
// Check if any integration for this receiver has send_resolved enabled
628+
receiverIntegrations, receiverExists := n.receivers[receiverName]
629+
if !receiverExists {
630+
// Receiver not found, fall back to standard behavior
631+
for _, a := range alerts {
632+
if n.muter.Mutes(a.Labels) {
633+
muted = append(muted, a)
634+
} else {
635+
filtered = append(filtered, a)
636+
}
637+
}
638+
if len(muted) > 0 {
639+
n.metrics.numNotificationSuppressedTotal.WithLabelValues(SuppressedReasonSilence).Add(float64(len(muted)))
640+
logger.Debug("Notifications will not be sent for silenced alerts", "alerts", fmt.Sprintf("%v", muted), "reason", SuppressedReasonSilence)
641+
}
642+
return ctx, filtered, nil
643+
}
644+
645+
// Check if any integration has send_resolved enabled
646+
hasSendResolved := false
647+
for _, integration := range receiverIntegrations {
648+
if integration.SendResolved() {
649+
hasSendResolved = true
650+
break
651+
}
652+
}
653+
654+
// Process each alert
655+
for _, a := range alerts {
656+
if !n.muter.Mutes(a.Labels) {
657+
// Alert is not silenced, let it through
658+
filtered = append(filtered, a)
659+
continue
660+
}
661+
662+
// Alert is silenced
663+
if !a.Resolved() || !hasSendResolved {
664+
// Alert is firing or receiver doesn't have send_resolved, filter it out
665+
muted = append(muted, a)
666+
continue
667+
}
668+
669+
// Alert is resolved and receiver has send_resolved enabled
670+
// Check if this alert was previously notified
671+
wasNotified := false
672+
for _, integration := range receiverIntegrations {
673+
if !integration.SendResolved() {
674+
continue
675+
}
676+
677+
recv := &nflogpb.Receiver{
678+
GroupName: receiverName,
679+
Integration: integration.Name(),
680+
Idx: uint32(integration.Index()),
681+
}
682+
683+
entries, err := n.notificationLog.Query(nflog.QGroupKey(groupKey), nflog.QReceiver(recv))
684+
if err != nil && !errors.Is(err, nflog.ErrNotFound) {
685+
logger.Warn("Failed to query notification log for silenced resolved alert", "alert", a.Name(), "err", err)
686+
continue
687+
}
688+
689+
if len(entries) > 0 && len(entries[0].FiringAlerts) > 0 {
690+
// This alert was previously notified as firing for this integration
691+
wasNotified = true
692+
break
693+
}
694+
}
695+
696+
if wasNotified {
697+
// Alert was previously notified and is now resolved, let it through for resolved notification
698+
filtered = append(filtered, a)
699+
logger.Debug("Allowing resolved notification for silenced alert", "alert", a.Name())
700+
} else {
701+
// Alert was not previously notified, filter it out
702+
muted = append(muted, a)
703+
}
704+
}
705+
706+
if len(muted) > 0 {
707+
n.metrics.numNotificationSuppressedTotal.WithLabelValues(SuppressedReasonSilence).Add(float64(len(muted)))
708+
logger.Debug("Notifications will not be sent for silenced alerts", "alerts", fmt.Sprintf("%v", muted), "reason", SuppressedReasonSilence)
709+
}
710+
711+
return ctx, filtered, nil
712+
}
713+
586714
// WaitStage waits for a certain amount of time before continuing or until the
587715
// context is done.
588716
type WaitStage struct {

notify/notify_test.go

Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -823,6 +823,204 @@ func TestMuteStageWithSilences(t *testing.T) {
823823
}
824824
}
825825

826+
func TestMuteStageWithSendResolved(t *testing.T) {
827+
// Create silences
828+
silences, err := silence.New(silence.Options{Retention: time.Hour})
829+
require.NoError(t, err)
830+
831+
sil := &silencepb.Silence{
832+
EndsAt: utcNow().Add(time.Hour),
833+
Matchers: []*silencepb.Matcher{{Name: "mute", Pattern: "me"}},
834+
}
835+
require.NoError(t, silences.Set(sil))
836+
837+
reg := prometheus.NewRegistry()
838+
marker := types.NewMarker(reg)
839+
silencer := silence.NewSilencer(silences, marker, promslog.NewNopLogger())
840+
841+
// Create notification log
842+
nflog, err := nflog.New(nflog.Options{
843+
Retention: time.Hour,
844+
Logger: promslog.NewNopLogger(),
845+
Metrics: reg,
846+
})
847+
require.NoError(t, err)
848+
849+
// Create receivers with different send_resolved settings
850+
receiversWithSendResolved := map[string][]Integration{
851+
"with_resolved": {
852+
NewIntegration(&testNotifier{}, sendResolved(true), "webhook", 0, "with_resolved"),
853+
},
854+
"without_resolved": {
855+
NewIntegration(&testNotifier{}, sendResolved(false), "webhook", 0, "without_resolved"),
856+
},
857+
"mixed": {
858+
NewIntegration(&testNotifier{}, sendResolved(true), "webhook", 0, "mixed"),
859+
NewIntegration(&testNotifier{}, sendResolved(false), "email", 1, "mixed"),
860+
},
861+
}
862+
863+
metrics := NewMetrics(reg, featurecontrol.NoopFlags{})
864+
stage := NewMuteStageWithSendResolved(silencer, nflog, receiversWithSendResolved, metrics)
865+
866+
// Test 1: Silenced firing alerts are filtered out
867+
t.Run("silenced firing alerts are filtered", func(t *testing.T) {
868+
firingAlert := &types.Alert{
869+
Alert: model.Alert{
870+
Labels: model.LabelSet{"mute": "me", "alertname": "test"},
871+
EndsAt: time.Time{}, // Firing alert has zero EndsAt
872+
StartsAt: time.Now().Add(-time.Hour),
873+
},
874+
}
875+
876+
ctx := context.Background()
877+
ctx = WithReceiverName(ctx, "with_resolved")
878+
ctx = WithGroupKey(ctx, "test-group")
879+
880+
_, alerts, err := stage.Exec(ctx, promslog.NewNopLogger(), firingAlert)
881+
require.NoError(t, err)
882+
require.Empty(t, alerts, "firing silenced alerts should be filtered")
883+
})
884+
885+
// Test 2: Silenced resolved alerts without prior notification are filtered
886+
t.Run("silenced resolved alerts without prior notification are filtered", func(t *testing.T) {
887+
resolvedAlert := &types.Alert{
888+
Alert: model.Alert{
889+
Labels: model.LabelSet{"mute": "me", "alertname": "test2"},
890+
StartsAt: time.Now().Add(-2 * time.Hour),
891+
EndsAt: time.Now().Add(-time.Hour),
892+
},
893+
}
894+
895+
ctx := context.Background()
896+
ctx = WithReceiverName(ctx, "with_resolved")
897+
ctx = WithGroupKey(ctx, "test-group-2")
898+
899+
_, alerts, err := stage.Exec(ctx, promslog.NewNopLogger(), resolvedAlert)
900+
require.NoError(t, err)
901+
require.Empty(t, alerts, "resolved silenced alerts without prior notification should be filtered")
902+
})
903+
904+
// Test 3: Silenced resolved alerts with prior notification and send_resolved=true are allowed
905+
t.Run("silenced resolved alerts with prior notification and send_resolved are allowed", func(t *testing.T) {
906+
resolvedAlert := &types.Alert{
907+
Alert: model.Alert{
908+
Labels: model.LabelSet{"mute": "me", "alertname": "test3"},
909+
StartsAt: time.Now().Add(-2 * time.Hour),
910+
EndsAt: time.Now().Add(-time.Hour),
911+
},
912+
}
913+
914+
groupKey := "test-group-3"
915+
receiverName := "with_resolved"
916+
917+
// Log a previous notification for this alert
918+
recv := &nflogpb.Receiver{
919+
GroupName: receiverName,
920+
Integration: "webhook",
921+
Idx: 0,
922+
}
923+
err := nflog.Log(recv, groupKey, []uint64{1234}, []uint64{}, time.Hour)
924+
require.NoError(t, err)
925+
926+
ctx := context.Background()
927+
ctx = WithReceiverName(ctx, receiverName)
928+
ctx = WithGroupKey(ctx, groupKey)
929+
930+
_, alerts, err := stage.Exec(ctx, promslog.NewNopLogger(), resolvedAlert)
931+
require.NoError(t, err)
932+
require.Len(t, alerts, 1, "resolved silenced alerts with prior notification should be allowed when send_resolved=true")
933+
})
934+
935+
// Test 4: Silenced resolved alerts with send_resolved=false are filtered even with prior notification
936+
t.Run("silenced resolved alerts without send_resolved are filtered", func(t *testing.T) {
937+
resolvedAlert := &types.Alert{
938+
Alert: model.Alert{
939+
Labels: model.LabelSet{"mute": "me", "alertname": "test4"},
940+
StartsAt: time.Now().Add(-2 * time.Hour),
941+
EndsAt: time.Now().Add(-time.Hour),
942+
},
943+
}
944+
945+
groupKey := "test-group-4"
946+
receiverName := "without_resolved"
947+
948+
// Log a previous notification for this alert
949+
recv := &nflogpb.Receiver{
950+
GroupName: receiverName,
951+
Integration: "webhook",
952+
Idx: 0,
953+
}
954+
err := nflog.Log(recv, groupKey, []uint64{5678}, []uint64{}, time.Hour)
955+
require.NoError(t, err)
956+
957+
ctx := context.Background()
958+
ctx = WithReceiverName(ctx, receiverName)
959+
ctx = WithGroupKey(ctx, groupKey)
960+
961+
_, alerts, err := stage.Exec(ctx, promslog.NewNopLogger(), resolvedAlert)
962+
require.NoError(t, err)
963+
require.Empty(t, alerts, "resolved silenced alerts should be filtered when send_resolved=false")
964+
})
965+
966+
// Test 5: Non-silenced alerts pass through
967+
t.Run("non-silenced alerts pass through", func(t *testing.T) {
968+
firingAlert := &types.Alert{
969+
Alert: model.Alert{
970+
Labels: model.LabelSet{"not": "muted", "alertname": "test5"},
971+
EndsAt: time.Time{},
972+
StartsAt: time.Now().Add(-time.Hour),
973+
},
974+
}
975+
resolvedAlert := &types.Alert{
976+
Alert: model.Alert{
977+
Labels: model.LabelSet{"not": "muted", "alertname": "test6"},
978+
StartsAt: time.Now().Add(-2 * time.Hour),
979+
EndsAt: time.Now().Add(-time.Hour),
980+
},
981+
}
982+
983+
ctx := context.Background()
984+
ctx = WithReceiverName(ctx, "with_resolved")
985+
ctx = WithGroupKey(ctx, "test-group-5")
986+
987+
_, alerts, err := stage.Exec(ctx, promslog.NewNopLogger(), firingAlert, resolvedAlert)
988+
require.NoError(t, err)
989+
require.Len(t, alerts, 2, "non-silenced alerts should pass through")
990+
})
991+
992+
// Test 6: Mixed receiver with at least one send_resolved=true integration
993+
t.Run("mixed receiver with send_resolved allows resolved silenced alerts", func(t *testing.T) {
994+
resolvedAlert := &types.Alert{
995+
Alert: model.Alert{
996+
Labels: model.LabelSet{"mute": "me", "alertname": "test7"},
997+
StartsAt: time.Now().Add(-2 * time.Hour),
998+
EndsAt: time.Now().Add(-time.Hour),
999+
},
1000+
}
1001+
1002+
groupKey := "test-group-7"
1003+
receiverName := "mixed"
1004+
1005+
// Log a previous notification for the webhook integration (send_resolved=true)
1006+
recv := &nflogpb.Receiver{
1007+
GroupName: receiverName,
1008+
Integration: "webhook",
1009+
Idx: 0,
1010+
}
1011+
err := nflog.Log(recv, groupKey, []uint64{9999}, []uint64{}, time.Hour)
1012+
require.NoError(t, err)
1013+
1014+
ctx := context.Background()
1015+
ctx = WithReceiverName(ctx, receiverName)
1016+
ctx = WithGroupKey(ctx, groupKey)
1017+
1018+
_, alerts, err := stage.Exec(ctx, promslog.NewNopLogger(), resolvedAlert)
1019+
require.NoError(t, err)
1020+
require.Len(t, alerts, 1, "resolved silenced alerts should be allowed when at least one integration has send_resolved=true")
1021+
})
1022+
}
1023+
8261024
func TestTimeMuteStage(t *testing.T) {
8271025
sydney, err := time.LoadLocation("Australia/Sydney")
8281026
if err != nil {
@@ -1080,3 +1278,9 @@ func BenchmarkHashAlert(b *testing.B) {
10801278
hashAlert(alert)
10811279
}
10821280
}
1281+
1282+
type testNotifier struct{}
1283+
1284+
func (n *testNotifier) Notify(ctx context.Context, alerts ...*types.Alert) (bool, error) {
1285+
return true, nil
1286+
}

0 commit comments

Comments
 (0)