Skip to content

Commit 5c98c46

Browse files
Addressing review comments
1 parent aeac810 commit 5c98c46

File tree

2 files changed

+107
-72
lines changed

2 files changed

+107
-72
lines changed

scripts/gnoi_shutdown_daemon.py

Lines changed: 106 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ def get_dpu_ip(dpu_name: str):
9999
entry = _cfg_get_entry("DHCP_SERVER_IPV4_PORT", f"bridge-midplane|{dpu_name.lower()}")
100100
return entry.get("ips@")
101101

102-
def get_gnmi_port(dpu_name: str):
102+
def get_dpu_gnmi_port(dpu_name: str):
103103
variants = [dpu_name, dpu_name.lower(), dpu_name.upper()]
104104
for k in variants:
105105
entry = _cfg_get_entry("DPU_PORT", k)
@@ -167,6 +167,99 @@ def run(self):
167167
logger.log_debug(f"TimeoutEnforcer loop error: {e}")
168168
self._stop.wait(self._interval)
169169

170+
# ###############
171+
# gNOI Reboot Handler
172+
# ###############
173+
class GnoiRebootHandler:
    """
    Handles gNOI reboot operations for DPU modules, including sending reboot commands
    and polling for status completion.

    The handler resolves the DPU's IP and gNMI port, checks that the endpoint is
    reachable over TCP, issues a gNOI System.Reboot (HALT) through ``gnoi_client``
    inside the ``gnmi`` container, and then polls System.RebootStatus until the
    output reports "reboot complete" or the polling deadline expires.
    """

    def __init__(self, db, module_base: ModuleBase):
        # db is handed back to module_base.clear_module_state_transition().
        self._db = db
        self._mb = module_base

    def handle_transition(self, dpu_name: str, transition_type: str) -> bool:
        """
        Handle a shutdown or reboot transition for a DPU module.

        Args:
            dpu_name: Name of the DPU module being transitioned.
            transition_type: Transition kind as read by the caller; only the
                value "reboot" triggers clearing of the transition flag here.

        Returns:
            True if the operation completed successfully, False otherwise
            (lookup failure, unreachable endpoint, failed Reboot RPC, or
            status polling timeout).
        """
        try:
            dpu_ip = get_dpu_ip(dpu_name)
            port = get_dpu_gnmi_port(dpu_name)
            if not dpu_ip:
                raise RuntimeError("DPU IP not found")
            # Convert inside the try block so a missing or non-numeric port is
            # logged and reported as a failure instead of raising into the caller.
            port_num = int(port)
        except Exception as e:
            logger.log_error(f"Error getting DPU IP or port for {dpu_name}: {e}")
            return False

        # skip if TCP is not reachable
        if not is_tcp_open(dpu_ip, port_num):
            logger.log_info(f"Skipping {dpu_name}: {dpu_ip}:{port} unreachable (offline/down)")
            return False

        # Send Reboot HALT
        if not self._send_reboot_command(dpu_name, dpu_ip, port):
            return False

        # Poll RebootStatus
        reboot_successful = self._poll_reboot_status(dpu_name, dpu_ip, port)

        if reboot_successful:
            self._handle_successful_reboot(dpu_name, transition_type)
        else:
            logger.log_warning(f"Status polling of halting the services on DPU timed out for {dpu_name}.")

        return reboot_successful

    @staticmethod
    def _build_gnoi_cmd(dpu_ip: str, port: str, rpc: str, *extra_args: str) -> list:
        """Build the gnoi_client argv for a System-module RPC against dpu_ip:port."""
        return [
            "docker", "exec", "gnmi", "gnoi_client",
            f"-target={dpu_ip}:{port}",
            "-logtostderr", "-notls",
            "-module", "System",
            "-rpc", rpc,
            *extra_args,
        ]

    def _send_reboot_command(self, dpu_name: str, dpu_ip: str, port: str) -> bool:
        """
        Send gNOI Reboot HALT command to the DPU.

        Returns:
            True when the command exits with return code 0, False otherwise.
        """
        logger.log_notice(f"Issuing gNOI Reboot to {dpu_ip}:{port}")
        reboot_request = json.dumps(
            {"method": REBOOT_METHOD_HALT, "message": "Triggered by SmartSwitch graceful shutdown"}
        )
        reboot_cmd = self._build_gnoi_cmd(dpu_ip, port, "Reboot", "-jsonin", reboot_request)
        rc, out, err = execute_gnoi_command(reboot_cmd, timeout_sec=REBOOT_RPC_TIMEOUT_SEC)
        if rc != 0:
            # Prefer stderr for the failure reason; fall back to stdout when it is empty.
            logger.log_error(f"gNOI Reboot command failed for {dpu_name}: {err or out}")
            return False
        return True

    def _poll_reboot_status(self, dpu_name: str, dpu_ip: str, port: str) -> bool:
        """
        Poll RebootStatus until completion or timeout.

        Returns:
            True as soon as a successful RebootStatus call contains
            "reboot complete" (case-insensitive); False once
            STATUS_POLL_TIMEOUT_SEC has elapsed without that result.
        """
        logger.log_notice(
            f"Polling RebootStatus for {dpu_name} at {dpu_ip}:{port} "
            f"(timeout {STATUS_POLL_TIMEOUT_SEC}s, interval {STATUS_POLL_INTERVAL_SEC}s)"
        )
        # Monotonic clock keeps the deadline immune to wall-clock adjustments.
        deadline = time.monotonic() + STATUS_POLL_TIMEOUT_SEC
        status_cmd = self._build_gnoi_cmd(dpu_ip, port, "RebootStatus")
        while time.monotonic() < deadline:
            rc_s, out_s, _ = execute_gnoi_command(status_cmd, timeout_sec=STATUS_RPC_TIMEOUT_SEC)
            if rc_s == 0 and out_s and ("reboot complete" in out_s.lower()):
                return True
            time.sleep(STATUS_POLL_INTERVAL_SEC)
        return False

    def _handle_successful_reboot(self, dpu_name: str, transition_type: str):
        """Handle successful reboot completion, including clearing transition flags if needed."""
        # Only "reboot" transitions are cleared here; other transition types
        # are left untouched by this handler.
        if transition_type == "reboot":
            success = self._mb.clear_module_state_transition(self._db, dpu_name)
            if success:
                logger.log_info(f"Cleared transition for {dpu_name}")
            else:
                logger.log_warning(f"Failed to clear transition for {dpu_name}")
        logger.log_info(f"Halting the services on DPU is successful for {dpu_name}.")
262+
170263
# #########
171264
# Main loop
172265
# #########
@@ -179,6 +272,9 @@ def main():
179272
# Centralized transition reader
180273
module_base = ModuleBase()
181274

275+
# gNOI reboot handler
276+
reboot_handler = GnoiRebootHandler(db, module_base)
277+
182278
pubsub = _get_pubsub(db)
183279
state_dbid = _get_dbid_state(db)
184280

@@ -218,79 +314,17 @@ def main():
218314
time.sleep(1)
219315
continue
220316

221-
type = entry.get("transition_type")
222-
if entry.get("state_transition_in_progress", "False") == "True" and (type == "shutdown" or type == "reboot"):
223-
logger.log_info(f"{type} request detected for {dpu_name}. Initiating gNOI reboot.")
224-
try:
225-
dpu_ip = get_dpu_ip(dpu_name)
226-
port = get_gnmi_port(dpu_name)
227-
if not dpu_ip:
228-
raise RuntimeError("DPU IP not found")
229-
except Exception as e:
230-
logger.log_error(f"Error getting DPU IP or port for {dpu_name}: {e}")
231-
time.sleep(1)
232-
continue
233-
234-
# skip if TCP is not reachable
235-
if not is_tcp_open(dpu_ip, int(port)):
236-
logger.log_info(f"Skipping {dpu_name}: {dpu_ip}:{port} unreachable (offline/down)")
237-
time.sleep(1)
238-
continue
239-
240-
# 1) Send Reboot HALT
241-
logger.log_notice(f"Issuing gNOI Reboot to {dpu_ip}:{port}")
242-
reboot_cmd = [
243-
"docker", "exec", "gnmi", "gnoi_client",
244-
f"-target={dpu_ip}:{port}",
245-
"-logtostderr", "-notls",
246-
"-module", "System",
247-
"-rpc", "Reboot",
248-
"-jsonin", json.dumps({"method": REBOOT_METHOD_HALT, "message": "Triggered by SmartSwitch graceful shutdown"})
249-
]
250-
rc, out, err = execute_gnoi_command(reboot_cmd, timeout_sec=REBOOT_RPC_TIMEOUT_SEC)
251-
if rc != 0:
252-
logger.log_error(f"gNOI Reboot command failed for {dpu_name}: {err or out}")
253-
# As per HLD, daemon just logs and returns.
254-
time.sleep(1)
255-
continue
256-
257-
# 2) Poll RebootStatus with a real deadline
258-
logger.log_notice(
259-
f"Polling RebootStatus for {dpu_name} at {dpu_ip}:{port} "
260-
f"(timeout {STATUS_POLL_TIMEOUT_SEC}s, interval {STATUS_POLL_INTERVAL_SEC}s)"
261-
)
262-
deadline = time.monotonic() + STATUS_POLL_TIMEOUT_SEC
263-
reboot_successful = False
264-
265-
status_cmd = [
266-
"docker", "exec", "gnmi", "gnoi_client",
267-
f"-target={dpu_ip}:{port}",
268-
"-logtostderr", "-notls",
269-
"-module", "System",
270-
"-rpc", "RebootStatus"
271-
]
272-
while time.monotonic() < deadline:
273-
rc_s, out_s, err_s = execute_gnoi_command(status_cmd, timeout_sec=STATUS_RPC_TIMEOUT_SEC)
274-
if rc_s == 0 and out_s and ("reboot complete" in out_s.lower()):
275-
reboot_successful = True
276-
break
277-
time.sleep(STATUS_POLL_INTERVAL_SEC)
278-
279-
if reboot_successful:
280-
if type == "reboot":
281-
success = module_base.clear_module_state_transition(db, dpu_name)
282-
if success:
283-
logger.log_info(f"Cleared transition for {dpu_name}")
284-
else:
285-
logger.log_warning(f"Failed to clear transition for {dpu_name}")
286-
logger.log_info(f"Halting the services on DPU is successful for {dpu_name}.")
287-
else:
288-
logger.log_warning(f"Status polling of halting the services on DPU timed out for {dpu_name}.")
317+
transition_type = entry.get("transition_type")
318+
if entry.get("state_transition_in_progress", "False") == "True" and (transition_type == "shutdown" or transition_type == "reboot"):
319+
logger.log_info(f"{transition_type} request detected for {dpu_name}. Initiating gNOI reboot.")
320+
reboot_handler.handle_transition(dpu_name, transition_type)
289321

290322
# NOTE:
291-
# The CHASSIS_MODULE_TABLE transition flag is cleared for startup/shutdown in
292-
# module_base.py. The daemon does not clear it. For reboot transitions, the
293-
# daemon relies on the TimeoutEnforcer thread to clear any stuck transitions.
323+
# For shutdown transitions, the platform clears the transition flag.
324+
# For reboot transitions, the daemon clears it upon successful completion.
325+
# The TimeoutEnforcer thread clears any stuck transitions that exceed timeout.
326+
327+
time.sleep(1)
294328

295329
if __name__ == "__main__":
296330
main()

tests/gnoi_shutdown_daemon_test.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -567,6 +567,7 @@ def _mk_pubsub_once2():
567567
return pubsub
568568

569569

570+
class TestGnoiShutdownDaemonAdditional(unittest.TestCase):
570571
def test_shutdown_skips_when_port_closed(self):
571572
with patch("gnoi_shutdown_daemon.SonicV2Connector") as mock_sonic, \
572573
patch("gnoi_shutdown_daemon.ModuleBase", new=_MBStub2), \

0 commit comments

Comments
 (0)