|
| 1 | +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. |
| 2 | +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary |
| 3 | +# |
| 4 | +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual |
| 5 | +# property and proprietary rights in and to this material, related |
| 6 | +# documentation and any modifications thereto. Any use, reproduction, |
| 7 | +# disclosure or distribution of this material and related documentation |
| 8 | +# without an express license agreement from NVIDIA CORPORATION or |
| 9 | +# its affiliates is strictly prohibited. |
| 10 | + |
| 11 | +# docker compose -f docker_compose.yaml up -d |
| 12 | +services: |
| 13 | + customizer: # Customizer service: published on 8001, reserves all NVIDIA GPUs, waits for a healthy PostgreSQL |
| 14 | + image: ${CUSTOMIZER_IMAGE:-""} |
| 15 | + container_name: nemo-customizer |
| 16 | + restart: on-failure |
| 17 | + ports: |
| 18 | + - "8001:8001" |
| 19 | + volumes: |
| 20 | + - ./customizer:/mount/cfg |
| 21 | + # map a path to model if already exists |
| 22 | + # otherwise, the model will be automatically downloaded from NGC to /app/models and /app |
| 23 | + # Ex: /raid/models/llama-3_1-8b-instruct:/app/models/llama-3_1-8b-instruct |
| 24 | + # - <INSERT_ABS_PATH_TO_MODEL>/llama-3_1-8b-instruct:/app/models/llama-3_1-8b-instruct |
| 25 | + environment: |
| 26 | + - CONFIG_PATH=/mount/cfg/customizer_config.yaml # read from the ./customizer bind mount above |
| 27 | + - DB_HOST=nemo-postgresql |
| 28 | + - DB_PORT=5432 |
| 29 | + - DB_USER=test_user |
| 30 | + - DB_PASSWORD=1234 |
| 31 | + - DB_NAME=customizer |
| 32 | + - PORT=8001 |
| 33 | + - NGC_API_KEY=${NGC_API_KEY:-""} |
| 34 | + - OTEL_SDK_DISABLED=true |
| 35 | + healthcheck: |
| 36 | + test: ["CMD", "curl", "-f", "http://localhost:8001/v1/health/live"] # -f: curl exits non-zero on HTTP >= 400, so an erroring endpoint is reported unhealthy |
| 37 | + interval: 10s |
| 38 | + timeout: 3s |
| 39 | + retries: 3 |
| 40 | + depends_on: |
| 41 | + nemo-postgresql: |
| 42 | + condition: service_healthy |
| 43 | + entity-store: |
| 44 | + condition: service_started |
| 45 | + data-store: |
| 46 | + condition: service_started |
| 47 | + networks: |
| 48 | + - nemo-ms |
| 49 | + deploy: |
| 50 | + resources: |
| 51 | + reservations: |
| 52 | + devices: |
| 53 | + - driver: nvidia |
| 54 | + capabilities: [gpu] |
| 55 | + count: all |
| 56 | + shm_size: "1G" |
| 57 | + |
| 58 | + entity-store: # Entity store service; starts only after entity-store-initializer has completed successfully |
| 59 | + image: ${ENTITY_STORE_IMAGE:-""} |
| 60 | + platform: linux/amd64 |
| 61 | + container_name: nemo-entity-store |
| 62 | + restart: on-failure |
| 63 | + ports: |
| 64 | + - "8003:8000" # host port 8003 -> container port 8000 |
| 65 | + environment: |
| 66 | + - POSTGRES_PASSWORD=1234 |
| 67 | + - POSTGRES_USER=test_user |
| 68 | + - POSTGRES_HOST=nemo-postgresql |
| 69 | + - POSTGRES_DB=entity-store |
| 70 | + - BASE_URL_DATASTORE=http://data-store:3000/v1/hf |
| 71 | + - BASE_URL_NIM=http://nim:8002 # NOTE(review): the nim service is commented out in this file and sets NIM_SERVER_PORT=8000; confirm port 8002 is intended |
| 72 | + depends_on: |
| 73 | + entity-store-initializer: |
| 74 | + condition: service_completed_successfully |
| 75 | + networks: |
| 76 | + - nemo-ms |
| 77 | + |
| 78 | + entity-store-initializer: # One-shot job: runs scripts.run_db_migration from the entity-store image once PostgreSQL is healthy |
| 79 | + image: ${ENTITY_STORE_IMAGE:-""} |
| 80 | + platform: linux/amd64 |
| 81 | + working_dir: /app/services/entity-store |
| 82 | + environment: |
| 83 | + - POSTGRES_PASSWORD=1234 |
| 84 | + - POSTGRES_USER=test_user |
| 85 | + - POSTGRES_HOST=nemo-postgresql |
| 86 | + - POSTGRES_DB=entity-store |
| 87 | + depends_on: |
| 88 | + nemo-postgresql: |
| 89 | + condition: service_healthy |
| 90 | + entrypoint: ["/app/.venv/bin/python3", "-m", "scripts.run_db_migration"] # replaces the image entrypoint with the migration module |
| 91 | + networks: |
| 92 | + - nemo-ms |
| 93 | + |
| 94 | + evaluator: # Evaluator service: published on 7331, starts after its DB migration job has completed successfully |
| 95 | + image: ${EVALUATOR_IMAGE:-""} |
| 96 | + container_name: nemo-evaluator |
| 97 | + restart: on-failure |
| 98 | + ports: |
| 99 | + - "7331:7331" |
| 100 | + depends_on: |
| 101 | + data-store: |
| 102 | + condition: service_started |
| 103 | + nemo-postgresql: |
| 104 | + condition: service_healthy |
| 105 | + evaluator-postgres-db-migration: |
| 106 | + condition: service_completed_successfully |
| 107 | + otel-collector: |
| 108 | + condition: service_started |
| 109 | + networks: |
| 110 | + - nemo-ms |
| 111 | + healthcheck: |
| 112 | + test: ["CMD", "curl", "-f", "http://localhost:7331/health"] # -f: curl exits non-zero on HTTP >= 400, so an erroring endpoint is reported unhealthy |
| 113 | + interval: 10s |
| 114 | + timeout: 3s |
| 115 | + retries: 3 |
| 116 | + environment: |
| 117 | + MODE: standalone |
| 118 | + # Dependencies |
| 119 | + POSTGRES_URI: postgresql://test_user:1234@nemo-postgresql:5432/evaluation |
| 120 | + ARGO_HOST: none |
| 121 | + NAMESPACE: nemo-evaluation |
| 122 | + DATA_STORE_URL: http://data-store:3000/v1/hf |
| 123 | + EVAL_CONTAINER: ${EVALUATOR_IMAGE} |
| 124 | + SERVICE_ACCOUNT: nemo-evaluator-test-workflow-executor |
| 125 | + EVAL_ENABLE_VALIDATION: "False" # quoted so YAML passes the literal string instead of a boolean |
| 126 | + # OpenTelemetry environmental variables |
| 127 | + OTEL_SERVICE_NAME: nemo-evaluator |
| 128 | + OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317 |
| 129 | + OTEL_TRACES_EXPORTER: otlp |
| 130 | + OTEL_METRICS_EXPORTER: none |
| 131 | + OTEL_LOGS_EXPORTER: otlp |
| 132 | + OTEL_PYTHON_EXCLUDED_URLS: "health" |
| 133 | + OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED: "true" |
| 134 | + CONSOLE_LOG_LEVEL: DEBUG |
| 135 | + OTEL_LOG_LEVEL: DEBUG |
| 136 | + LOG_LEVEL: DEBUG |
| 137 | + |
| 138 | + evaluator-postgres-db-migration: # One-shot job: runs the evaluator image's DB migration script once PostgreSQL is healthy |
| 139 | + image: ${EVALUATOR_IMAGE:-""} |
| 140 | + environment: |
| 141 | + MODE: standalone |
| 142 | + POSTGRES_URI: postgresql://test_user:1234@nemo-postgresql:5432/evaluation |
| 143 | + DATA_STORE_URL: none |
| 144 | + ARGO_HOST: none |
| 145 | + NAMESPACE: none |
| 146 | + EVAL_CONTAINER: none |
| 147 | + LOG_LEVEL: INFO |
| 148 | + entrypoint: /bin/sh # replaces the image entrypoint; the command below is passed to the shell |
| 149 | + command: ["-c", "/app/scripts/run-db-migration.sh"] |
| 150 | + depends_on: |
| 151 | + nemo-postgresql: |
| 152 | + condition: service_healthy |
| 153 | + networks: |
| 154 | + - nemo-ms |
| 155 | + |
| 156 | + nemo-postgresql: # Shared PostgreSQL instance referenced by customizer, entity-store, evaluator and data-store |
| 157 | + image: bitnami/postgresql:16.1.0-debian-11-r20 |
| 158 | + container_name: nemo-postgresql |
| 159 | + platform: linux/amd64 |
| 160 | + restart: unless-stopped |
| 161 | + environment: |
| 162 | + - POSTGRESQL_VOLUME_DIR=/bitnami/postgresql |
| 163 | + - PGDATA=/bitnami/postgresql/data |
| 164 | + - POSTGRES_USER=test_user |
| 165 | + - POSTGRES_PASSWORD=1234 |
| 166 | + - POSTGRES_DATABASE=postgres |
| 167 | + # List of databases to create if they do not exist |
| 168 | + - DATABASES=entity-store,ndsdb,customizer,evaluation # presumably consumed by a script in ./init_scripts; verify |
| 169 | + ports: |
| 170 | + - "5432:5432" |
| 171 | + volumes: |
| 172 | + - nemo-postgresql:/bitnami/postgresql:rw |
| 173 | + - ./init_scripts:/docker-entrypoint-initdb.d:ro |
| 174 | + networks: |
| 175 | + - nemo-ms |
| 176 | + healthcheck: |
| 177 | + test: ["CMD-SHELL", "pg_isready -U $${POSTGRES_USER} -d $${POSTGRES_DATABASE}"] # $$ escapes Compose interpolation; the container shell expands the variables |
| 178 | + interval: 10s |
| 179 | + timeout: 3s |
| 180 | + retries: 3 |
| 181 | + |
| 182 | + data-store-volume-init: # One-shot job: opens up permissions on the nemo-data-store volume before data-store starts |
| 183 | + image: busybox |
| 184 | + command: ["sh", "-c", "chmod -R 777 /nds-data"] |
| 185 | + volumes: |
| 186 | + - nemo-data-store:/nds-data |
| 187 | + restart: "no" # quoted so YAML reads the string "no" rather than a boolean |
| 188 | + deploy: |
| 189 | + restart_policy: |
| 190 | + condition: none |
| 191 | + |
| 192 | + data-store: # Data store service (configured via GITEA__* variables), backed by the ndsdb PostgreSQL database |
| 193 | + image: ${DATA_STORE_IMAGE:-""} |
| 194 | + platform: linux/amd64 |
| 195 | + container_name: nemo-data-store |
| 196 | + restart: on-failure |
| 197 | + environment: |
| 198 | + - USER_UID=${USER_ID} # match this to the UID of the owner of the data directory |
| 199 | + - USER_GID=${GROUP_ID} # match this to the GID of the owner of the data directory |
| 200 | + - APP_NAME=Datastore |
| 201 | + - INSTALL_LOCK=true |
| 202 | + - DISABLE_SSH=true |
| 203 | + - GITEA_WORK_DIR=/nds-data |
| 204 | + - GITEA__SERVER__APP_DATA_PATH=/nds-data |
| 205 | + - GITEA__DAEMON_USER=git |
| 206 | + - GITEA__HTTP_PORT=3000 |
| 207 | + - GITEA__APP__NAME=datastore |
| 208 | + - GITEA__SERVER__LFS_START_SERVER=true |
| 209 | + - GITEA__LFS__SERVE_DIRECT=true |
| 210 | + - GITEA__LFS__STORAGE_TYPE=local |
| 211 | + - GITEA__LFS_START__SERVER=true |
| 212 | + - GITEA__SECURITY__INSTALL_LOCK=true |
| 213 | + - GITEA__SERVICE__DEFAULT_ALLOW_CREATE_ORGANIZATION=true |
| 214 | + - GITEA__SMTP_ENABLED=false |
| 215 | + # Database |
| 216 | + - GITEA__DATABASE__DB_TYPE=postgres |
| 217 | + - GITEA__DATABASE__HOST=nemo-postgresql:5432 |
| 218 | + - GITEA__DATABASE__NAME=ndsdb |
| 219 | + - GITEA__DATABASE__USER=test_user |
| 220 | + - GITEA__DATABASE__PASSWD=1234 |
| 221 | + - GITEA__DATABASE_SSL_MODE=disable |
| 222 | + volumes: |
| 223 | + - nemo-data-store:/nds-data:rw |
| 224 | + - /etc/timezone:/etc/timezone:ro |
| 225 | + - /etc/localtime:/etc/localtime:ro |
| 226 | + ports: |
| 227 | + - "3000:3000" |
| 228 | + healthcheck: |
| 229 | + test: ["CMD", "curl", "-f", "http://localhost:3000/v1/health"] # -f: curl exits non-zero on HTTP >= 400, so an erroring endpoint is reported unhealthy |
| 230 | + interval: 10s |
| 231 | + timeout: 3s |
| 232 | + retries: 3 |
| 233 | + depends_on: |
| 234 | + nemo-postgresql: |
| 235 | + condition: service_healthy |
| 236 | + data-store-volume-init: |
| 237 | + condition: service_completed_successfully |
| 238 | + networks: |
| 239 | + - nemo-ms |
| 240 | + |
| 241 | + # Optional NIM requires additional 1 GPU of at least 40GB /v1/health/ready |
| 242 | + # nim: |
| 243 | + # image: ${NIM_IMAGE:-""} |
| 244 | + # container_name: nim |
| 245 | + # restart: on-failure |
| 246 | + # ports: |
| 247 | + # - 8002:8000 |
| 248 | + # environment: |
| 249 | + # - NGC_API_KEY=${NGC_API_KEY} |
| 250 | + # - NIM_SERVER_PORT=8000 |
| 251 | + # - NIM_SERVED_MODEL_NAME=${NIM_MODEL_ID} |
| 252 | + # - NIM_PEFT_REFRESH_INTERVAL=60 |
| 253 | + # - NIM_MAX_GPU_LORAS=1 |
| 254 | + # - NIM_MAX_CPU_LORAS=16 |
| 255 | + # - NIM_PEFT_SOURCE=http://entity-store:8000 |
| 256 | + # runtime: nvidia |
| 257 | + # volumes: [] |
| 258 | + # # Map a local directory to the cache directory to avoid downloading the model every time |
| 259 | + # # Ensure to set write permissions on the local directory for all users: chmod -R a+w /path/to/directory |
| 260 | + # # Ex: /raid/nim-cache:/opt/nim/.cache. Brev: - /ephemeral/.cache/nim-cache:/opt/nim/.cache |
| 261 | + # networks: |
| 262 | + # - nemo-ms |
| 263 | + # shm_size: 16GB |
| 264 | + # user: root |
| 265 | + # deploy: |
| 266 | + # resources: |
| 267 | + # reservations: |
| 268 | + # devices: |
| 269 | + # - driver: nvidia |
| 270 | + # capabilities: [gpu] |
| 271 | + # count: all |
| 272 | + # healthcheck: |
| 273 | + # test: [ |
| 274 | + # "CMD", |
| 275 | + # "python3", |
| 276 | + # "-c", |
| 277 | + # "import requests, sys; sys.exit(0 if requests.get('http://localhost:8000/v1/health/live').ok else 1)" |
| 278 | + # ] |
| 279 | + # interval: 10s |
| 280 | + # timeout: 3s |
| 281 | + # retries: 20 |
| 282 | + # # allow for 60 seconds to download a model and start up |
| 283 | + # start_period: 60s |
| 284 | + |
| 285 | + |
| 286 | + ### |
| 287 | + # OpenTelemetry Collector (local) |
| 288 | + # adapted from https://jessitron.com/2021/08/11/run-an-opentelemetry-collector-locally-in-docker/ |
| 289 | + # and https://github.com/open-telemetry/opentelemetry-demo/blob/main/docker-compose.yml |
| 290 | + ### |
| 291 | + otel-collector: # Local OpenTelemetry Collector; the evaluator service exports OTLP to it on port 4317 |
| 292 | + image: otel/opentelemetry-collector-contrib:0.91.0 |
| 293 | + command: ["--config=/etc/otel-collector-config.yaml"] |
| 294 | + volumes: |
| 295 | + - ./config/otel-collector-config.yaml:/etc/otel-collector-config.yaml # supplies the config file named in the command above |
| 296 | + ports: |
| 297 | + - "4317:4317" # OTLP over gRPC receiver |
| 298 | + - "55679:55679" # UI |
| 299 | + networks: |
| 300 | + - nemo-ms |
| 301 | + |
| 302 | +networks: # Networks shared by the services in this file |
| 303 | + nemo-ms: # bridge network that every service above attaches to, except data-store-volume-init |
| 304 | + driver: bridge |
| 305 | + |
| 306 | +volumes: # Named volumes |
| 307 | + nemo-data-store: # mounted at /nds-data by data-store and data-store-volume-init |
| 308 | + driver: local |
| 309 | + nemo-postgresql: # mounted at /bitnami/postgresql by nemo-postgresql |
| 310 | + driver: local |
0 commit comments