Skip to content

Commit a786cc6

Browse files
authored
add nvidia container health checks (#1573)
1 parent a39915b commit a786cc6

File tree

1 file changed

+14
-0
lines changed

1 file changed

+14
-0
lines changed

docker-compose.yml

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -103,6 +103,7 @@ services:
103103
girder_worker_pipelines:
104104
# Merge base-worker object with this config
105105
<< : *base-worker
106+
restart: always
106107
deploy:
107108
resources:
108109
reservations:
@@ -116,10 +117,17 @@ services:
116117
- "WORKER_CONCURRENCY=${PIPELINE_WORKER_CONCURRENCY:-1}"
117118
- "WORKER_GPU_UUID=${PIPELINE_GPU_UUID}"
118119
- "CELERY_BROKER_URL=${CELERY_BROKER_URL:-amqp://guest:guest@rabbit/default}"
120+
healthcheck:
121+
test: ["CMD", "nvidia-smi"]
122+
interval: 15m
123+
timeout: 10s
124+
retries: 1
125+
start_period: 1m
119126

120127
girder_worker_training:
121128
# Merge base-worker object with this config
122129
<< : *base-worker
130+
restart: always
123131
deploy:
124132
resources:
125133
reservations:
@@ -132,6 +140,12 @@ services:
132140
- "WORKER_CONCURRENCY=${TRAINING_WORKER_CONCURRENCY:-1}"
133141
- "WORKER_GPU_UUID=${TRAINING_GPU_UUID}"
134142
- "CELERY_BROKER_URL=${CELERY_BROKER_URL:-amqp://guest:guest@rabbit/default}"
143+
healthcheck:
144+
test: ["CMD", "nvidia-smi"]
145+
interval: 15m
146+
timeout: 10s
147+
retries: 1
148+
start_period: 1m
135149

136150
volumes:
137151
addons:

0 commit comments

Comments
 (0)