@@ -167,12 +167,12 @@ def run_job(self, job: JobExecutorInterface):
             get_uuid(f"{self.run_namespace}-{job.jobid}-{job.attempt}")
         )
 
-        body = kubernetes.client.V1Pod()
+        body = kubernetes.client.V1Job()
         body.metadata = kubernetes.client.V1ObjectMeta(labels={"app": "snakemake"})
         body.metadata.name = jobid
 
         # Container setup
-        container = kubernetes.client.V1Container(name=jobid)
+        container = kubernetes.client.V1Container(name="snakemake")
         container.image = self.container_image
         container.command = shlex.split("/bin/sh")
         container.args = ["-c", exec_job]
@@ -196,9 +196,13 @@ def run_job(self, job: JobExecutorInterface):
             self.logger.debug(f"Set node selector for machine type: {node_selector}")
 
         # Initialize PodSpec
-        body.spec = kubernetes.client.V1PodSpec(
+        pod_spec = kubernetes.client.V1PodSpec(
             containers=[container], node_selector=node_selector, restart_policy="Never"
         )
+        body.spec = kubernetes.client.V1JobSpec(
+            backoff_limit=0,
+            template=kubernetes.client.V1PodTemplateSpec(spec=pod_spec),
+        )
 
         # Add toleration for GPU nodes if GPU is requested
         if "gpu" in resources_dict:
@@ -212,9 +216,9 @@ def run_job(self, job: JobExecutorInterface):
             manufacturer_lc = manufacturer.lower()
             if manufacturer_lc == "nvidia":
                 # Toleration for nvidia.com/gpu
-                if body.spec.tolerations is None:
-                    body.spec.tolerations = []
-                body.spec.tolerations.append(
+                if pod_spec.tolerations is None:
+                    pod_spec.tolerations = []
+                pod_spec.tolerations.append(
                     kubernetes.client.V1Toleration(
                         key="nvidia.com/gpu",
                         operator="Equal",
@@ -223,14 +227,14 @@ def run_job(self, job: JobExecutorInterface):
                     )
                 )
                 self.logger.debug(
-                    f"Added toleration for NVIDIA GPU: {body.spec.tolerations}"
+                    f"Added toleration for NVIDIA GPU: {pod_spec.tolerations}"
                 )
 
             elif manufacturer_lc == "amd":
                 # Toleration for amd.com/gpu
-                if body.spec.tolerations is None:
-                    body.spec.tolerations = []
-                body.spec.tolerations.append(
+                if pod_spec.tolerations is None:
+                    pod_spec.tolerations = []
+                pod_spec.tolerations.append(
                     kubernetes.client.V1Toleration(
                         key="amd.com/gpu",
                         operator="Equal",
@@ -239,7 +243,7 @@ def run_job(self, job: JobExecutorInterface):
                     )
                 )
                 self.logger.debug(
-                    f"Added toleration for AMD GPU: {body.spec.tolerations}"
+                    f"Added toleration for AMD GPU: {pod_spec.tolerations}"
                 )
 
             else:
@@ -273,15 +277,15 @@ def run_job(self, job: JobExecutorInterface):
 
         # Add service account name if provided
         if self.k8s_service_account_name:
-            body.spec.service_account_name = self.k8s_service_account_name
+            pod_spec.service_account_name = self.k8s_service_account_name
             self.logger.debug(
                 f"Set service account name: {self.k8s_service_account_name}"
             )
 
         # Workdir volume
         workdir_volume = kubernetes.client.V1Volume(name="workdir")
         workdir_volume.empty_dir = kubernetes.client.V1EmptyDirVolumeSource()
-        body.spec.volumes = [workdir_volume]
+        pod_spec.volumes = [workdir_volume]
 
         for pvc in self.persistent_volumes:
             volume = kubernetes.client.V1Volume(name=pvc.name)
@@ -290,7 +294,7 @@ def run_job(self, job: JobExecutorInterface):
                     claim_name=pvc.name
                 )
             )
-            body.spec.volumes.append(volume)
+            pod_spec.volumes.append(volume)
 
         # Env vars
         container.env = []
@@ -378,7 +382,7 @@ def run_job(self, job: JobExecutorInterface):
         # Try creating the pod with exception handling
         try:
             pod = self._kubernetes_retry(
-                lambda: self.kubeapi.create_namespaced_pod(self.namespace, body)
+                lambda: self.batchapi.create_namespaced_job(self.namespace, body)
             )
         except kubernetes.client.rest.ApiException as e:
             self.logger.error(f"Failed to create pod: {e}")
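Submission now goes through the Kubernetes batch API, so the executor needs a BatchV1Api client (self.batchapi) alongside the existing CoreV1Api client (self.kubeapi), which is still used below for listing pods and reading pod logs. The diff does not show where self.batchapi is created; a typical setup would look roughly like this (a hedged sketch, not the plugin's actual initialization code):

import kubernetes.client
import kubernetes.config

# Hypothetical client setup; attribute names mirror the diff above.
kubernetes.config.load_kube_config()       # or load_incluster_config() inside a cluster
kubeapi = kubernetes.client.CoreV1Api()    # pod-level operations: list_namespaced_pod, read_namespaced_pod_log
batchapi = kubernetes.client.BatchV1Api()  # job-level operations: create/read/delete namespaced jobs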
@@ -416,7 +420,7 @@ async def check_active_jobs(
             async with self.status_rate_limiter:
                 try:
                     res = self._kubernetes_retry(
-                        lambda: self.kubeapi.read_namespaced_pod_status(
+                        lambda: self.batchapi.read_namespaced_job_status(
                             j.external_jobid, self.namespace
                         )
                     )
@@ -436,34 +440,66 @@ async def check_active_jobs(
                     self.report_job_error(j, msg=str(e))
                     continue
 
+            # Sometimes, just checking the status of a job is not enough because,
+            # depending on the cluster setup, additional containers can be injected
+            # into pods, which prevents the job from detecting that a pod has
+            # already terminated.
+            # We therefore check the status of the snakemake container in addition
+            # to the job status.
+            pods = self.kubeapi.list_namespaced_pod(
+                namespace=self.namespace,
+                label_selector=f"job-name={j.external_jobid}",
+            )
+            assert len(pods.items) <= 1
+            if pods.items:
+                pod = pods.items[0]
+                snakemake_container = [
+                    container
+                    for container in pod.status.container_statuses
+                    if container.name == "snakemake"
+                ][0]
+                snakemake_container_exit_code = (
+                    snakemake_container.state.terminated.exit_code
+                    if snakemake_container.state.terminated is not None
+                    else None
+                )
+            else:
+                snakemake_container = None
+                snakemake_container_exit_code = None
+
             if res is None:
                 msg = (
-                    "Unknown pod {jobid}. Has the pod been deleted manually?"
+                    "Unknown job {jobid}. Has the job been deleted manually?"
                 ).format(jobid=j.external_jobid)
                 self.logger.error(msg)
                 self.report_job_error(j, msg=msg)
-            elif res.status.phase == "Failed":
+            elif res.status.failed == 1 or (
+                snakemake_container_exit_code is not None
+                and snakemake_container_exit_code != 0
+            ):
                 msg = (
                     "For details, please issue:\n"
-                    "kubectl describe pod {jobid}\n"
-                    "kubectl logs {jobid}"
-                ).format(jobid=j.external_jobid)
-                # failed
-                kube_log_content = self.kubeapi.read_namespaced_pod_log(
-                    name=j.external_jobid, namespace=self.namespace
+                    f"kubectl describe job {j.external_jobid}\n"
+                    f"kubectl logs {j.external_jobid}"
                 )
+                # failed
                 kube_log = self.log_path / f"{j.external_jobid}.log"
                 with open(kube_log, "w") as f:
-                    f.write(kube_log_content)
+                    kube_log_content = self.kubeapi.read_namespaced_pod_log(
+                        name=pod.metadata.name,
+                        namespace=self.namespace,
+                        container=snakemake_container.name,
+                    )
+                    print(kube_log_content, file=f)
                 self.logger.error(f"Job {j.external_jobid} failed. {msg}")
                 self.report_job_error(j, msg=msg, aux_logs=[str(kube_log)])
-            elif res.status.phase == "Succeeded":
+            elif res.status.succeeded == 1 or (snakemake_container_exit_code == 0):
                 # finished
                 self.logger.info(f"Job {j.external_jobid} succeeded.")
                 self.report_job_success(j)
 
                 self._kubernetes_retry(
-                    lambda: self.safe_delete_pod(
+                    lambda: self.safe_delete_job(
                         j.external_jobid, ignore_not_found=True
                     )
                 )
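The added block relies on the Job controller labeling the pods it creates with job-name=<job name>, so the pod backing a job can be found with a label selector and the snakemake container's exit code read from its terminated state. A condensed sketch of the same lookup as a standalone helper (a hypothetical function for illustration, not part of the plugin):

def snakemake_container_exit_code(core_api, namespace, job_name):
    """Return the exit code of the 'snakemake' container of the pod owned by
    the given job, or None if the pod or the container has not terminated yet."""
    pods = core_api.list_namespaced_pod(
        namespace=namespace, label_selector=f"job-name={job_name}"
    )
    if not pods.items:
        return None
    for status in pods.items[0].status.container_statuses or []:
        if status.name == "snakemake" and status.state.terminated is not None:
            return status.state.terminated.exit_code
    return None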
@@ -476,7 +512,9 @@ def cancel_jobs(self, active_jobs: List[SubmittedJobInfo]):
         # Cancel all active jobs.
         for j in active_jobs:
             self._kubernetes_retry(
-                lambda: self.safe_delete_pod(j.external_jobid, ignore_not_found=True)
+                lambda jobid=j.external_jobid: self.safe_delete_job(
+                    jobid, ignore_not_found=True
+                )
             )
 
     def shutdown(self):
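The default argument in the new lambda is not cosmetic: Python closures bind loop variables late, so binding jobid at creation time pins the value per iteration and a retried callback cannot end up targeting a different job than intended. A minimal illustration of the difference:

# Late binding: all three closures see the final value of i.
late = [lambda: i for i in range(3)]
print([f() for f in late])    # [2, 2, 2]

# Default argument: each closure captures the value of i at creation time.
bound = [lambda i=i: i for i in range(3)]
print([f() for f in bound])   # [0, 1, 2]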
@@ -521,16 +559,18 @@ def unregister_secret(self):
             )
         )
 
-    def safe_delete_pod(self, jobid, ignore_not_found=True):
+    def safe_delete_job(self, jobid, ignore_not_found=True):
         import kubernetes.client
 
         body = kubernetes.client.V1DeleteOptions()
         try:
-            self.kubeapi.delete_namespaced_pod(jobid, self.namespace, body=body)
+            self.batchapi.delete_namespaced_job(
+                jobid, self.namespace, propagation_policy="Foreground", body=body
+            )
         except kubernetes.client.rest.ApiException as e:
             if e.status == 404 and ignore_not_found:
                 self.logger.warning(
-                    "[WARNING] 404 not found when trying to delete the pod: {jobid}\n"
+                    "[WARNING] 404 not found when trying to delete the job: {jobid}\n"
                     "[WARNING] Ignore this error\n".format(jobid=jobid)
                 )
             else: