Skip to content

Commit b384297

Browse files
committed
19417 FIX Piggyback-hub: Fix crash when many piggyback hosts are updated at once
Handle event queue overflows by rescanning for messages, and delete watchees of deleted host folders to free up watch descriptors. SUP-28282 Change-Id: I52b6844c317e791ed60f634f01343314f2d3e8db
1 parent 37bda6f commit b384297

5 files changed

Lines changed: 105 additions & 7 deletions

File tree

.werks/19417.md

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
[//]: # (werk v2)
2+
# Piggyback Hub: Fix crash when many piggyback hosts are updated at once
3+
4+
key | value
5+
---------- | ---
6+
date | 2026-04-07T16:47:36+00:00
7+
version | 2.5.0b5
8+
class | fix
9+
edition | cre
10+
component | checks
11+
level | 1
12+
compatible | yes
13+
14+
On sites with a large number of piggybacked hosts, the piggyback hub process
15+
could crash when many of them were updated at the same time.
16+
The hub now recovers gracefully: Any updates that were missed during the burst
17+
are detected and reprocessed automatically.
18+
19+
If you see the following message in the piggyback hub logs under
20+
`var/log/piggyback-hub.log`, it means that the recovery path was triggered:
21+
```
22+
Too many messages for the piggyback-hub to progress at once, rescanning data. Consider raising /proc/sys/fs/inotify/max_queued_events.
23+
```
24+
In systems where a large number of piggybacked hosts are updated at the same
25+
time, the `fs.inotify.max_queued_events` limit should be raised to reduce how
26+
often the recovery path is triggered.
27+
The value should be at least the number of piggybacked hosts on the site, ideally
28+
at least double that number to account for bursts.
29+
To do so, run:
30+
```
31+
sysctl -w fs.inotify.max_queued_events=65536
32+
```
33+
Additionally, the piggyback hub registers an inotify watch for each piggybacked
34+
host directory. If `fs.inotify.max_user_watches` is lower than the number of
35+
piggybacked hosts, the hub may fail to monitor all of them. Check the current
36+
value with:
37+
```
38+
sysctl fs.inotify.max_user_watches
39+
```
40+
and raise it if necessary:
41+
```
42+
sysctl -w fs.inotify.max_user_watches=65536
43+
```
44+
To make these changes persist across reboots, add the respective lines to
45+
`/etc/sysctl.conf` (or a file in `/etc/sysctl.d/`):
46+
```
47+
fs.inotify.max_queued_events = 65536
48+
fs.inotify.max_user_watches = 65536
49+
```
50+

cmk/piggyback/backend/_inotify.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
# mypy: disable-error-code="no-untyped-call"
1717

1818
import enum
19+
import errno
1920
import os
2021
from collections.abc import Iterator, Sequence
2122
from ctypes import c_int, CDLL, get_errno
@@ -111,6 +112,10 @@ def iterate_parsed_events(self, data: bytes) -> Iterator[Event]:
111112
].split(b"\x00", 1)[0]
112113
offset = offset + self._FIXED_EVENT_PART_LEN + bytes_remaining
113114

115+
if raw_event_type & Masks.Q_OVERFLOW:
116+
yield Event(Watchee(-1, Path()), Masks(raw_event_type), Cookie(0), "")
117+
continue
118+
114119
yield Event(
115120
Watchee(int(raw_watch_descriptor), self._wd_map[raw_watch_descriptor]),
116121
Masks(raw_event_type),
@@ -183,7 +188,13 @@ def add_watch(self, path: Path, mask: Masks) -> Watchee:
183188
return Watchee(watch_descriptor, path)
184189

185190
def rm_watch(self, watchee: Watchee) -> None:
186-
self._libc.rm_watch(self._fileio.fileno(), watchee.wd)
191+
try:
192+
self._libc.rm_watch(self._fileio.fileno(), watchee.wd)
193+
except OSError as e:
194+
# EINVAL means the watch was already removed (e.g. the watched file/directory
195+
# was deleted), so we can safely ignore it. Re-raise any other OS error.
196+
if e.errno != errno.EINVAL:
197+
raise
187198
self._parser.drop(watchee.wd)
188199

189200
def read_forever(self) -> Iterator[Event]:

cmk/piggyback/backend/_storage.py

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,21 +90,42 @@ class PiggybackMessage:
9090
def watch_new_messages(omd_root: Path) -> Iterator[PiggybackMessage]:
9191
"""Yields piggyback messages as they come in."""
9292

93+
host_folder_mask = Masks.MOVED_TO | Masks.DELETE_SELF
94+
9395
with INotify() as inotify:
9496
watch_for_new_piggybacked_hosts = inotify.add_watch(payload_dir(omd_root), Masks.CREATE)
9597
watch_for_deleted_status_files = inotify.add_watch(
9698
source_status_dir(omd_root), Masks.DELETE
9799
)
98100
for folder in _get_piggybacked_host_folders(omd_root):
99-
inotify.add_watch(folder, Masks.MOVED_TO)
101+
inotify.add_watch(folder, host_folder_mask)
102+
103+
_last_processed_time: int = int(time.time())
100104

101105
for event in inotify.read_forever():
106+
if event.type & Masks.Q_OVERFLOW:
107+
logger.warning(
108+
"Too many messages for the piggyback-hub to progress at once, rescanning data. "
109+
"Consider raising /proc/sys/fs/inotify/max_queued_events."
110+
)
111+
# check if any data was missed when the event queue overflowed
112+
for source_file in _get_source_state_files(omd_root):
113+
if (
114+
mtime := _get_mtime(source_file)
115+
) is not None and mtime >= _last_processed_time:
116+
source = HostName(source_file.name)
117+
for piggybacked_host in _get_piggybacked_hosts_for_source(omd_root, source):
118+
yield from get_messages_for(HostAddress(piggybacked_host), omd_root)
119+
_last_processed_time = int(time.time())
120+
continue
121+
102122
# check if a new piggybacked host folder was created
103123
if event.watchee == watch_for_new_piggybacked_hosts:
104124
if event.type & Masks.CREATE:
105-
inotify.add_watch(event.watchee.path / event.name, Masks.MOVED_TO)
125+
inotify.add_watch(event.watchee.path / event.name, host_folder_mask)
106126
# Handle all files already in the folder (we rather have duplicates than missing files)
107127
yield from get_messages_for(HostAddress(event.name), omd_root)
128+
_last_processed_time = int(time.time())
108129
continue
109130
if event.watchee == watch_for_deleted_status_files:
110131
if event.type & Masks.DELETE:
@@ -119,9 +140,15 @@ def watch_new_messages(omd_root: Path) -> Iterator[PiggybackMessage]:
119140
),
120141
b"",
121142
)
143+
_last_processed_time = int(time.time())
144+
continue
145+
146+
if event.type & Masks.DELETE_SELF:
147+
inotify.rm_watch(event.watchee)
122148
continue
123149

124150
if message := _make_message_from_event(event, omd_root):
151+
_last_processed_time = int(time.time())
125152
yield message
126153

127154

cmk/piggyback/hub/_main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ def _parse_arguments(argv: list[str]) -> Arguments:
101101

102102

103103
def _setup_logging(args: Arguments) -> logging.Logger:
104-
logger = getLogger(__name__)
104+
logger = getLogger("cmk.piggyback")
105105
handler: logging.StreamHandler | WatchedFileHandler = (
106106
logging.StreamHandler(stream=sys.stderr)
107107
if args.foreground

tests/unit/cmk/piggyback/test__inotify.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,13 @@
33
# This file is part of Checkmk (https://checkmk.com). It is subject to the terms and
44
# conditions defined in the file COPYING, which is part of this source code package.
55

6-
# mypy: disable-error-code="no-untyped-call"
7-
86
from pathlib import Path
7+
from struct import pack
98
from unittest.mock import ANY
109

1110
import pytest
1211

13-
from cmk.piggyback.backend._inotify import Cookie, Event, INotify, Masks, Watchee
12+
from cmk.piggyback.backend._inotify import _EventParser, Cookie, Event, INotify, Masks, Watchee
1413

1514

1615
@pytest.mark.skip(reason="CMK-26458")
@@ -100,3 +99,14 @@ def test_basic_event_observing(tmp_path: Path) -> None:
10099
]
101100
assert isinstance(actual[4].cookie, Cookie)
102101
assert actual[3].cookie == actual[4].cookie
102+
103+
104+
def test_overflow_event_does_not_crash() -> None:
105+
"""IN_Q_OVERFLOW must not cause a KeyError regardless of the watch descriptor value."""
106+
parser = _EventParser()
107+
# Raw inotify event: wd=-1, mask=IN_Q_OVERFLOW (0x4000), cookie=0, name_len=0
108+
data = pack("iIII", -1, 0x4000, 0, 0)
109+
(event,) = parser.iterate_parsed_events(data)
110+
111+
assert event.type & Masks.Q_OVERFLOW
112+
assert event.watchee == Watchee(-1, Path())

0 commit comments

Comments
 (0)