@@ -99,7 +99,7 @@ def get_dpu_ip(dpu_name: str):
9999 entry = _cfg_get_entry ("DHCP_SERVER_IPV4_PORT" , f"bridge-midplane|{ dpu_name .lower ()} " )
100100 return entry .get ("ips@" )
101101
102- def get_gnmi_port (dpu_name : str ):
102+ def get_dpu_gnmi_port (dpu_name : str ):
103103 variants = [dpu_name , dpu_name .lower (), dpu_name .upper ()]
104104 for k in variants :
105105 entry = _cfg_get_entry ("DPU_PORT" , k )
@@ -167,6 +167,99 @@ def run(self):
167167 logger .log_debug (f"TimeoutEnforcer loop error: { e } " )
168168 self ._stop .wait (self ._interval )
169169
170+ # ###############
171+ # gNOI Reboot Handler
172+ # ###############
class GnoiRebootHandler:
    """
    Run the gNOI "Reboot HALT" sequence against a single DPU module.

    The sequence is: resolve the DPU's IP and gNMI port, check that the
    endpoint accepts TCP connections, issue the gNOI ``System.Reboot`` RPC
    with the HALT method, then poll ``System.RebootStatus`` until the DPU
    reports completion or the polling deadline expires.

    Every failure path is logged and reported through the boolean return
    value of :meth:`handle_transition`; lookup and port-parsing errors are
    not propagated to the caller.
    """

    def __init__(self, db, module_base: ModuleBase):
        """
        Args:
            db: Database handle passed to
                ``module_base.clear_module_state_transition``.
            module_base: ``ModuleBase`` instance used to clear the
                transition flag after a successful reboot transition.
        """
        self._db = db
        self._mb = module_base

    def handle_transition(self, dpu_name: str, transition_type: str) -> bool:
        """
        Handle a shutdown or reboot transition for a DPU module.

        Args:
            dpu_name: Name of the DPU module.
            transition_type: Transition being processed. Only the value
                ``"reboot"`` makes this handler clear the transition flag
                on success.

        Returns:
            True if the HALT request was sent and RebootStatus reported
            completion before the deadline; False if the address/port
            lookup failed, the endpoint was unreachable, the Reboot RPC
            failed, or status polling timed out.
        """
        try:
            dpu_ip = get_dpu_ip(dpu_name)
            port = get_dpu_gnmi_port(dpu_name)
            if not dpu_ip:
                raise RuntimeError("DPU IP not found")
            # Parse the port inside the guarded block: a missing or
            # non-numeric value must be logged and reported as a failure,
            # not raised out of the handler into the caller's loop.
            port_num = int(port)
        except Exception as e:
            logger.log_error(f"Error getting DPU IP or port for {dpu_name}: {e}")
            return False

        # Skip if TCP is not reachable.
        if not is_tcp_open(dpu_ip, port_num):
            logger.log_info(f"Skipping {dpu_name}: {dpu_ip}:{port} unreachable (offline/down)")
            return False

        # Send Reboot HALT; the helper has already logged any failure.
        if not self._send_reboot_command(dpu_name, dpu_ip, port):
            return False

        # Poll RebootStatus until completion or timeout.
        reboot_successful = self._poll_reboot_status(dpu_name, dpu_ip, port)

        if reboot_successful:
            self._handle_successful_reboot(dpu_name, transition_type)
        else:
            logger.log_warning(f"Status polling of halting the services on DPU timed out for {dpu_name}.")

        return reboot_successful

    def _send_reboot_command(self, dpu_name: str, dpu_ip: str, port: str) -> bool:
        """
        Send the gNOI Reboot HALT command to the DPU.

        The RPC is issued by running ``gnoi_client`` inside the ``gnmi``
        container through ``execute_gnoi_command``.

        Returns:
            True if the command exited with return code 0, False otherwise
            (the error or output text is logged).
        """
        logger.log_notice(f"Issuing gNOI Reboot to {dpu_ip}:{port}")
        reboot_cmd = [
            "docker", "exec", "gnmi", "gnoi_client",
            f"-target={dpu_ip}:{port}",
            "-logtostderr", "-notls",
            "-module", "System",
            "-rpc", "Reboot",
            "-jsonin", json.dumps({"method": REBOOT_METHOD_HALT, "message": "Triggered by SmartSwitch graceful shutdown"}),
        ]
        rc, out, err = execute_gnoi_command(reboot_cmd, timeout_sec=REBOOT_RPC_TIMEOUT_SEC)
        if rc != 0:
            logger.log_error(f"gNOI Reboot command failed for {dpu_name}: {err or out}")
            return False
        return True

    def _poll_reboot_status(self, dpu_name: str, dpu_ip: str, port: str) -> bool:
        """
        Poll RebootStatus until completion or timeout.

        The RPC is repeated every ``STATUS_POLL_INTERVAL_SEC`` seconds until
        ``STATUS_POLL_TIMEOUT_SEC`` seconds have elapsed on the monotonic
        clock.

        Returns:
            True as soon as a successful call's output contains
            "reboot complete" (case-insensitive); False if the deadline
            passes first.
        """
        logger.log_notice(
            f"Polling RebootStatus for {dpu_name} at {dpu_ip}:{port} "
            f"(timeout {STATUS_POLL_TIMEOUT_SEC}s, interval {STATUS_POLL_INTERVAL_SEC}s)"
        )
        deadline = time.monotonic() + STATUS_POLL_TIMEOUT_SEC
        status_cmd = [
            "docker", "exec", "gnmi", "gnoi_client",
            f"-target={dpu_ip}:{port}",
            "-logtostderr", "-notls",
            "-module", "System",
            "-rpc", "RebootStatus",
        ]
        while time.monotonic() < deadline:
            # stderr is not inspected; a failed or inconclusive poll is
            # simply retried until the deadline.
            rc_s, out_s, _ = execute_gnoi_command(status_cmd, timeout_sec=STATUS_RPC_TIMEOUT_SEC)
            if rc_s == 0 and out_s and ("reboot complete" in out_s.lower()):
                return True
            time.sleep(STATUS_POLL_INTERVAL_SEC)
        return False

    def _handle_successful_reboot(self, dpu_name: str, transition_type: str):
        """
        Log a completed halt and, for reboot transitions, clear the flag.

        The transition flag is cleared through
        ``clear_module_state_transition`` only when ``transition_type`` is
        ``"reboot"``; a failure to clear it is logged as a warning and does
        not change the outcome reported by ``handle_transition``.
        """
        if transition_type == "reboot":
            success = self._mb.clear_module_state_transition(self._db, dpu_name)
            if success:
                logger.log_info(f"Cleared transition for {dpu_name}")
            else:
                logger.log_warning(f"Failed to clear transition for {dpu_name}")
        logger.log_info(f"Halting the services on DPU is successful for {dpu_name}.")
262+
170263# #########
171264# Main loop
172265# #########
@@ -179,6 +272,9 @@ def main():
179272 # Centralized transition reader
180273 module_base = ModuleBase ()
181274
275+ # gNOI reboot handler
276+ reboot_handler = GnoiRebootHandler (db , module_base )
277+
182278 pubsub = _get_pubsub (db )
183279 state_dbid = _get_dbid_state (db )
184280
@@ -218,79 +314,17 @@ def main():
218314 time .sleep (1 )
219315 continue
220316
221- type = entry .get ("transition_type" )
222- if entry .get ("state_transition_in_progress" , "False" ) == "True" and (type == "shutdown" or type == "reboot" ):
223- logger .log_info (f"{ type } request detected for { dpu_name } . Initiating gNOI reboot." )
224- try :
225- dpu_ip = get_dpu_ip (dpu_name )
226- port = get_gnmi_port (dpu_name )
227- if not dpu_ip :
228- raise RuntimeError ("DPU IP not found" )
229- except Exception as e :
230- logger .log_error (f"Error getting DPU IP or port for { dpu_name } : { e } " )
231- time .sleep (1 )
232- continue
233-
234- # skip if TCP is not reachable
235- if not is_tcp_open (dpu_ip , int (port )):
236- logger .log_info (f"Skipping { dpu_name } : { dpu_ip } :{ port } unreachable (offline/down)" )
237- time .sleep (1 )
238- continue
239-
240- # 1) Send Reboot HALT
241- logger .log_notice (f"Issuing gNOI Reboot to { dpu_ip } :{ port } " )
242- reboot_cmd = [
243- "docker" , "exec" , "gnmi" , "gnoi_client" ,
244- f"-target={ dpu_ip } :{ port } " ,
245- "-logtostderr" , "-notls" ,
246- "-module" , "System" ,
247- "-rpc" , "Reboot" ,
248- "-jsonin" , json .dumps ({"method" : REBOOT_METHOD_HALT , "message" : "Triggered by SmartSwitch graceful shutdown" })
249- ]
250- rc , out , err = execute_gnoi_command (reboot_cmd , timeout_sec = REBOOT_RPC_TIMEOUT_SEC )
251- if rc != 0 :
252- logger .log_error (f"gNOI Reboot command failed for { dpu_name } : { err or out } " )
253- # As per HLD, daemon just logs and returns.
254- time .sleep (1 )
255- continue
256-
257- # 2) Poll RebootStatus with a real deadline
258- logger .log_notice (
259- f"Polling RebootStatus for { dpu_name } at { dpu_ip } :{ port } "
260- f"(timeout { STATUS_POLL_TIMEOUT_SEC } s, interval { STATUS_POLL_INTERVAL_SEC } s)"
261- )
262- deadline = time .monotonic () + STATUS_POLL_TIMEOUT_SEC
263- reboot_successful = False
264-
265- status_cmd = [
266- "docker" , "exec" , "gnmi" , "gnoi_client" ,
267- f"-target={ dpu_ip } :{ port } " ,
268- "-logtostderr" , "-notls" ,
269- "-module" , "System" ,
270- "-rpc" , "RebootStatus"
271- ]
272- while time .monotonic () < deadline :
273- rc_s , out_s , err_s = execute_gnoi_command (status_cmd , timeout_sec = STATUS_RPC_TIMEOUT_SEC )
274- if rc_s == 0 and out_s and ("reboot complete" in out_s .lower ()):
275- reboot_successful = True
276- break
277- time .sleep (STATUS_POLL_INTERVAL_SEC )
278-
279- if reboot_successful :
280- if type == "reboot" :
281- success = module_base .clear_module_state_transition (db , dpu_name )
282- if success :
283- logger .log_info (f"Cleared transition for { dpu_name } " )
284- else :
285- logger .log_warning (f"Failed to clear transition for { dpu_name } " )
286- logger .log_info (f"Halting the services on DPU is successful for { dpu_name } ." )
287- else :
288- logger .log_warning (f"Status polling of halting the services on DPU timed out for { dpu_name } ." )
317+ transition_type = entry .get ("transition_type" )
318+ if entry .get ("state_transition_in_progress" , "False" ) == "True" and (transition_type == "shutdown" or transition_type == "reboot" ):
319+ logger .log_info (f"{ transition_type } request detected for { dpu_name } . Initiating gNOI reboot." )
320+ reboot_handler .handle_transition (dpu_name , transition_type )
289321
290322 # NOTE:
291- # The CHASSIS_MODULE_TABLE transition flag is cleared for startup/shutdown in
292- # module_base.py. The daemon does not clear it. For reboot transitions, the
293- # daemon relies on the TimeoutEnforcer thread to clear any stuck transitions.
323+ # For shutdown transitions, the platform clears the transition flag.
324+ # For reboot transitions, the daemon clears it upon successful completion.
325+ # The TimeoutEnforcer thread clears any stuck transitions that exceed timeout.
326+
327+ time .sleep (1 )
294328
295329if __name__ == "__main__" :
296330 main ()
0 commit comments