Skip to content

Commit 67db6cc

Browse files
bugclerk and bmeagherix authored
NAS-140250 / 26.0.0-BETA.1 / Fix keepalived boot deadlock in configure_addresses_impl (by bmeagherix) (#18440)
ix-netif.service runs Before=network-pre.target, but keepalived requires After=network-online.target. Starting keepalived from configure_addresses_impl (which is called via ix-netif.service) therefore caused systemd to queue the keepalived start job for ~95s, until network-online.target was eventually satisfied after ix-netif.service completed - a structural deadlock. Fix this by guarding the keepalived START behind the ix-netif completion sentinel: (1) if keepalived is already running, RELOAD it as before; (2) if it is not running and the sentinel exists (i.e. we are in a post-boot interface.sync call), START it; (3) if the sentinel does not exist, we are in the early-boot call, so skip keepalived entirely - it will be started once the network is online. Also move NETIF_COMPLETE_SENTINEL from middlewared/plugins/smb_/constants.py to the more appropriate middlewared/utils/interface.py and update its importers accordingly. Original PR: #18437. Co-authored-by: Brian M <brian.meagher@ixsystems.com>
1 parent d35495b commit 67db6cc

File tree

4 files changed

+27
-18
lines changed

4 files changed

+27
-18
lines changed

src/middlewared/middlewared/plugins/interface/addresses.py

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
import ipaddress
4+
import os
45
import socket
56

67
from truenas_pynetif.address.address import add_address, remove_address, replace_address
@@ -11,6 +12,7 @@
1112

1213
from middlewared.plugins.interface.dhcp import dhcp_leases, dhcp_status, dhcp_stop
1314
from middlewared.service import ServiceContext
15+
from middlewared.utils.interface import NETIF_COMPLETE_SENTINEL
1416

1517
from .sync_data import SyncData
1618

@@ -170,17 +172,6 @@ def configure_addresses_impl(
170172
"tunable.set_sysctl", f"net.ipv6.conf.{name}.autoconf", autoconf
171173
)
172174

173-
# Handle keepalived for VIPs
174-
if vip or alias_vips:
175-
if not ctx.middleware.call_sync("service.started", "keepalived"):
176-
ctx.middleware.call_sync(
177-
"service.control", "START", "keepalived"
178-
).wait_sync(raise_error=True)
179-
else:
180-
ctx.middleware.call_sync(
181-
"service.control", "RELOAD", "keepalived"
182-
).wait_sync(raise_error=True)
183-
184175
# Add addresses in database but not configured
185176
for addr in addrs_database - addrs_configured:
186177
address = addr.address
@@ -195,6 +186,27 @@ def configure_addresses_impl(
195186
broadcast=addr.broadcast,
196187
)
197188

189+
# Bring interface up
190+
if not (link.flags & IFFlags.UP):
191+
set_link_up(sock, index=link_index)
192+
193+
# Handle keepalived for VIPs. Skip the START during early boot (when called
194+
# from ix-netif.service) because keepalived requires network-online.target
195+
# which cannot be reached until ix-netif.service itself completes — starting
196+
# it here would deadlock for 95s. The sentinel is written by ix-netif.service
197+
# on successful completion, so its presence means the network is up.
198+
if vip or alias_vips:
199+
if ctx.middleware.call_sync("service.started", "keepalived"):
200+
ctx.middleware.call_sync(
201+
"service.control", "RELOAD", "keepalived"
202+
).wait_sync(raise_error=True)
203+
elif os.path.exists(NETIF_COMPLETE_SENTINEL):
204+
ctx.middleware.call_sync(
205+
"service.control", "START", "keepalived"
206+
).wait_sync(raise_error=True)
207+
# else: early boot call from ix-netif.service; keepalived will be
208+
# started later once network-online.target is satisfied.
209+
198210
# Configure MTU (skip for bond members)
199211
skip_mtu = sync_data.is_bond_member(name)
200212
if not skip_mtu:
@@ -213,9 +225,5 @@ def configure_addresses_impl(
213225
"Failed to set interface description on %s", name, exc_info=True
214226
)
215227

216-
# Bring interface up
217-
if not (link.flags & IFFlags.UP):
218-
set_link_up(sock, index=link_index)
219-
220228
# Return True if DHCP should be started
221229
return not status.running and data["int_dhcp"]

src/middlewared/middlewared/plugins/smb.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,12 +25,12 @@
2525
from middlewared.service import ConfigService, ValidationError, ValidationErrors
2626
from middlewared.service_exception import CallError, MatchNotFound
2727
from middlewared.plugins.smb_.constants import (
28-
NETIF_COMPLETE_SENTINEL,
2928
CONFIGURED_SENTINEL,
3029
SMB_AUDIT_DEFAULTS,
3130
SMBCmd,
3231
SMBPath,
3332
)
33+
from middlewared.utils.interface import NETIF_COMPLETE_SENTINEL
3434
from middlewared.plugins.smb_.constants import VEEAM_REPO_BLOCKSIZE
3535
from middlewared.plugins.smb_.constants import SMBShareField as share_field
3636
from middlewared.plugins.smb_.sharesec import remove_share_acl

src/middlewared/middlewared/plugins/smb_/constants.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,9 @@
11
import enum
22
import os
33
from middlewared.utils import ctdb
4-
from middlewared.utils import MIDDLEWARE_RUN_DIR
54
from middlewared.utils.directoryservices.krb5_constants import SAMBA_KEYTAB_DIR
65

76

8-
NETIF_COMPLETE_SENTINEL = f"{MIDDLEWARE_RUN_DIR}/ix-netif-complete"
97
CONFIGURED_SENTINEL = '/var/run/samba/.configured'
108
SMB_AUDIT_DEFAULTS = {'enable': False, 'watch_list': [], 'ignore_list': []}
119
VEEAM_REPO_BLOCKSIZE = 131072

src/middlewared/middlewared/utils/interface.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,11 @@
22
import os
33
import time
44

5+
from middlewared.utils import MIDDLEWARE_RUN_DIR
6+
57

68
IFACE_LINK_STATE_MAX_WAIT: int = 60
9+
NETIF_COMPLETE_SENTINEL = f"{MIDDLEWARE_RUN_DIR}/ix-netif-complete"
710
RTF_GATEWAY: int = 0x0002
811
RTF_UP: int = 0x0001
912

0 commit comments

Comments
 (0)