1515from dask_task_models_library .container_tasks .docker import DockerBasicAuth
1616from dask_task_models_library .container_tasks .errors import (
1717 ServiceInputsUseFileToKeyMapButReceivesZipDataError ,
18+ ServiceOutOfMemoryError ,
1819 ServiceRuntimeError ,
20+ ServiceTimeoutLoggingError ,
1921)
2022from dask_task_models_library .container_tasks .io import FileUrl , TaskOutputData
2123from dask_task_models_library .container_tasks .protocol import ContainerTaskParameters
2224from models_library .progress_bar import ProgressReport
2325from packaging import version
24- from pydantic import ValidationError
26+ from pydantic import ByteSize , TypeAdapter , ValidationError
2527from pydantic .networks import AnyUrl
26- from servicelib .logging_utils import LogLevelInt , LogMessageStr
28+ from servicelib .logging_utils import LogLevelInt , LogMessageStr , log_catch
2729from servicelib .progress_bar import ProgressBarData
2830from settings_library .s3 import S3Settings
2931from yarl import URL
@@ -263,31 +265,55 @@ async def run(self, command: list[str]) -> TaskOutputData:
263265 ):
264266 await container .start ()
265267 await self ._publish_sidecar_log (
266- f"Container started as '{ container .id } ' on { socket .gethostname ()} ..."
268+ f"Service { self . task_parameters . image } : { self . task_parameters . tag } started as '{ container .id } ' on { socket .gethostname ()} ..."
267269 )
268270 # wait until the container finished, either success or fail or timeout
269271 while (container_data := await container .show ())["State" ]["Running" ]:
270272 await asyncio .sleep (CONTAINER_WAIT_TIME_SECS )
273+
async def _safe_get_last_logs() -> list[str]:
    """Best-effort retrieval of the container's most recent log output.

    Asks the enclosing scope's ``container`` for its stdout and stderr
    with ``tail=20`` and ``follow=False`` and returns the resulting list.

    The request runs inside ``log_catch(_logger, reraise=False)``.
    NOTE(review): presumably that context manager logs and suppresses
    ordinary exceptions -- confirm against servicelib. The statement after
    the ``with`` block is only reachable when the block exits without
    returning, in which case a single placeholder message is returned
    instead of real log lines.
    """
    fallback = ["Unexpected error: Could not retrieve logs."]
    with log_catch(_logger, reraise=False):
        # The cast is applied before awaiting, exactly as a typing aid:
        # it has no runtime effect on the object returned by container.log.
        pending_logs = container.log(
            stdout=True, stderr=True, tail=20, follow=False
        )
        tail_lines = await cast(Coroutine, pending_logs)
        assert isinstance(tail_lines, list)  # nosec
        return tail_lines
    return fallback
285+
286+ # Check for OOMKilled
287+ if container_data ["State" ].get ("OOMKilled" , False ):
288+ raise ServiceOutOfMemoryError (
289+ service_key = self .task_parameters .image ,
290+ service_version = self .task_parameters .tag ,
291+ service_resources = TypeAdapter (ByteSize )
292+ .validate_python (self .task_max_resources .get ("RAM" , 0 ))
293+ .human_readable (),
294+ container_id = container .id ,
295+ service_logs = await _safe_get_last_logs (),
296+ )
297+
271298 if container_data ["State" ]["ExitCode" ] > os .EX_OK :
272299 raise ServiceRuntimeError (
273300 service_key = self .task_parameters .image ,
274301 service_version = self .task_parameters .tag ,
275302 container_id = container .id ,
276303 exit_code = container_data ["State" ]["ExitCode" ],
277- service_logs = await cast (
278- Coroutine ,
279- container .log (
280- stdout = True , stderr = True , tail = 20 , follow = False
281- ),
282- ),
304+ service_logs = await _safe_get_last_logs (),
283305 )
284- await self ._publish_sidecar_log ("Container ran successfully." )
306+ await self ._publish_sidecar_log (
307+ f"Service { self .task_parameters .image } :{ self .task_parameters .tag } completed successfully."
308+ )
285309
286310 # POST-PROCESSING (1 step weighted 5%)
287311 results = await self ._retrieve_output_data (
288312 task_volumes , image_labels .get_integration_version ()
289313 )
290- await self ._publish_sidecar_log ("Task completed successfully." )
314+ await self ._publish_sidecar_log (
315+ f"Uploaded output data of { self .task_parameters .image } :{ self .task_parameters .tag } successfully."
316+ )
291317 return results
292318
293319 async def __aenter__ (self ) -> "ComputationalSidecar" :
@@ -302,11 +328,21 @@ async def __aexit__(
302328 tb : TracebackType | None ,
303329 ) -> None :
304330 if exc :
305- await self ._publish_sidecar_log (
306- f"Task error:\n { exc } " , log_level = logging .ERROR
307- )
308- await self ._publish_sidecar_log (
309- "TIP: There might be more information in the service log file in the service outputs" ,
310- )
331+ if isinstance (exc , asyncio .CancelledError ):
332+ # cancelled errors are not logged as errors
333+ await self ._publish_sidecar_log ("Service was cancelled." )
334+ elif isinstance (exc , ServiceTimeoutLoggingError | ServiceOutOfMemoryError ):
335+ await self ._publish_sidecar_log (f"{ exc } " , log_level = logging .ERROR )
336+ await self ._publish_sidecar_log (
337+ "TIP: There might be more information in the service log to help debug the service issue." ,
338+ )
339+
340+ else :
341+ await self ._publish_sidecar_log (
342+ f"Service error: { exc } " , log_level = logging .ERROR
343+ )
344+ await self ._publish_sidecar_log (
345+ "TIP: There might be more information in the service log to help debug the service issue." ,
346+ )
311347 # ensure we pass the final progress
312348 self .task_publishers .publish_progress (ProgressReport (actual_value = 1 ))
0 commit comments