1+ import asyncio
2+ import contextlib
13import os
4+ import sys
25
36import aiorun
47import grpc
58
9+ from pynumaflow .info .server import write as info_server_write
610from pynumaflow .info .types import ContainerType , ServerInfo , MINIMUM_NUMAFLOW_VERSION
711from pynumaflow .sinker .servicer .async_servicer import AsyncSinkServicer
812from pynumaflow .proto .sinker import sink_pb2_grpc
2226 ON_SUCCESS_SINK_SOCK_PATH ,
2327 ON_SUCCESS_SINK_SERVER_INFO_FILE_PATH ,
2428 MAX_NUM_THREADS ,
29+ NUMAFLOW_GRPC_SHUTDOWN_GRACE_PERIOD_SECONDS ,
2530)
2631
27- from pynumaflow .shared .server import NumaflowServer , start_async_server
32+ from pynumaflow .shared .server import NumaflowServer
2833from pynumaflow .sinker ._dtypes import SinkAsyncCallable
2934
3035
@@ -118,13 +123,17 @@ def __init__(
118123 ]
119124
120125 self .servicer = AsyncSinkServicer (sinker_instance )
126+ self ._error : BaseException | None = None
121127
def start(self):
    """
    Starter function for the Async server class.

    A separate synchronous caller is needed so that all the async coroutines
    are started from a single context: ``aiorun`` drives ``aexec()`` on its
    event loop and this call returns only after that loop has been stopped.

    Raises:
        SystemExit: with exit code 1 when ``aexec()`` left an error in
            ``self._error``.
    """
    aiorun.run(self.aexec(), use_uvloop=True, shutdown_callback=self.shutdown_callback)
    # Compare against None explicitly: the attribute is ``BaseException | None``
    # and an exception class is free to define a falsy ``__bool__``/``__len__``,
    # which a plain truthiness test would silently treat as "no error".
    if self._error is not None:
        _LOGGER.critical("Server exiting due to UDF error: %s", self._error)
        sys.exit(1)
128137
async def aexec(self):
    """
    Start the async gRPC sink server and serve until it terminates.

    The coroutine builds the ``grpc.aio`` server on ``self.sock_path``,
    registers ``self.servicer``, writes the server-info file once the server
    has started, and then waits for termination. A background task waits on
    a shutdown event handed to the servicer and, when it is set, stops the
    server with a grace period. After termination the servicer's recorded
    error is copied to ``self._error`` (read by ``start()``) and the event
    loop is stopped so that ``aiorun.run()`` returns.
    """
    # As the server is async, we need to create a new server instance in the
    # same thread as the event loop so that all the async calls are made in the
    # same context
    server = grpc.aio.server(options=self._server_options)
    server.add_insecure_port(self.sock_path)

    # The asyncio.Event must be created here (inside aexec) rather than in __init__,
    # because it must be bound to the running event loop that aiorun creates.
    # At __init__ time no event loop exists yet.
    shutdown_event = asyncio.Event()
    # NOTE(review): presumably the servicer sets this event when the UDF
    # fails -- confirm against AsyncSinkServicer.
    self.servicer.set_shutdown_event(shutdown_event)

    sink_pb2_grpc.add_SinkServicer_to_server(self.servicer, server)

    serv_info = ServerInfo.get_default_server_info()
    serv_info.minimum_numaflow_version = MINIMUM_NUMAFLOW_VERSION[ContainerType.Sinker]

    # Publish the server-info file only after the server has actually started.
    await server.start()
    info_server_write(server_info=serv_info, info_file=self.server_info_file)

    _LOGGER.info(
        "Async GRPC Server listening on: %s with max threads: %s",
        self.sock_path,
        self.max_threads,
    )

    async def _watch_for_shutdown():
        """Wait for the shutdown event and stop the server with a grace period."""
        await shutdown_event.wait()
        _LOGGER.info("Shutdown signal received, stopping server gracefully...")
        # Stop accepting new requests and wait for a maximum of
        # NUMAFLOW_GRPC_SHUTDOWN_GRACE_PERIOD_SECONDS seconds for in-flight requests to complete
        await server.stop(NUMAFLOW_GRPC_SHUTDOWN_GRACE_PERIOD_SECONDS)

    shutdown_task = asyncio.create_task(_watch_for_shutdown())
    await server.wait_for_termination()

    # Propagate error so start() can exit with a non-zero code
    self._error = self.servicer._error

    # The watcher may still be parked on the event if the server terminated
    # for another reason; cancel it and swallow only the resulting
    # CancelledError so any real failure inside the watcher still surfaces.
    shutdown_task.cancel()
    with contextlib.suppress(asyncio.CancelledError):
        await shutdown_task

    _LOGGER.info("Stopping event loop...")
    # We use aiorun to manage the event loop. The aiorun.run() runs
    # forever until loop.stop() is called. If we don't stop the
    # event loop explicitly here, the python process will not exit.
    # It remains stuck for 5 minutes until liveness and readiness probe
    # fails enough times and k8s sends a SIGTERM
    # get_running_loop() is the documented way to obtain the loop from
    # inside a coroutine; it returns the loop that is executing this task.
    asyncio.get_running_loop().stop()
    _LOGGER.info("Event loop stopped")
0 commit comments