-
Notifications
You must be signed in to change notification settings - Fork 55
Expand file tree
/
Copy pathdocker-compose.yml
More file actions
421 lines (412 loc) · 17.2 KB
/
Copy pathdocker-compose.yml
File metadata and controls
421 lines (412 loc) · 17.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
services:
  # Backend API. Runs uvicorn with --reload against the bind-mounted
  # ./src/backend tree (see `volumes` and `command` below).
  backend:
    build:
      context: .
      dockerfile: docker/backend/Dockerfile
      # Build-time provenance (#926). scripts/deploy/start.sh exports these
      # from the local repo (`git rev-parse HEAD` etc.) before compose builds.
      # Unset = falls back to "unknown" so an out-of-band build still
      # produces a well-typed /api/version response.
      # Values are quoted so the YAML parser always sees plain strings.
      args:
        VERSION: "${VERSION:-unknown}"
        GIT_COMMIT: "${GIT_COMMIT:-unknown}"
        GIT_COMMIT_SUBJECT: "${GIT_COMMIT_SUBJECT:-unknown}"
        GIT_COMMIT_TIMESTAMP: "${GIT_COMMIT_TIMESTAMP:-unknown}"
        GIT_BRANCH: "${GIT_BRANCH:-unknown}"
        BUILD_DATE: "${BUILD_DATE:-unknown}"
    container_name: trinity-backend
    ports:
      - "8000:8000"
    environment:
      # SECURITY: Set these in your .env file, not here
      - SECRET_KEY=${SECRET_KEY:-}  # Required - generate with: openssl rand -hex 32
      - ADMIN_PASSWORD=${ADMIN_PASSWORD:-}  # Required - set a strong password
      - ADMIN_USERNAME=${ADMIN_USERNAME:-admin}
      - GOOGLE_CLIENT_ID=${GOOGLE_CLIENT_ID:-}
      - GOOGLE_CLIENT_SECRET=${GOOGLE_CLIENT_SECRET:-}
      # REDIS_URL embeds the `backend` ACL user — see the redis service below.
      - "REDIS_URL=redis://backend:${REDIS_BACKEND_PASSWORD:?REDIS_BACKEND_PASSWORD must be set in .env}@redis:6379"
      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
      - GOOGLE_API_KEY=${GOOGLE_API_KEY:-}  # For Gemini-powered agents
      - GEMINI_API_KEY=${GEMINI_API_KEY:-}  # For platform image generation (IMG-001)
      # Gemini model overrides (#1130) — non-empty defaults so an unset var
      # doesn't inject "" (#1076).
      - GEMINI_TEXT_MODEL=${GEMINI_TEXT_MODEL:-gemini-3.5-flash}  # image-gen prompt refinement
      - GEMINI_TRANSCRIPTION_MODEL=${GEMINI_TRANSCRIPTION_MODEL:-gemini-3.5-flash}  # Telegram voice transcription
      # Gemini Live model override (#1076: non-empty default so an unset var
      # doesn't inject "").
      - VOICE_MODEL=${VOICE_MODEL:-models/gemini-3.1-flash-live-preview}
      - VOICE_ENABLED=${VOICE_ENABLED:-true}  # Voice mode (default on when GEMINI_API_KEY set)
      - WORKSPACE_ENABLED=${WORKSPACE_ENABLED:-false}  # Voice Workspace canvas — opt-in BETA (#860)
      # VoIP telephony (VOIP-001, #1056) — opt-in, default OFF. Master switch must
      # reach the container or voip_available stays false (read by config.py).
      - VOIP_ENABLED=${VOIP_ENABLED:-false}
      - VOIP_MAX_CALL_DURATION=${VOIP_MAX_CALL_DURATION:-600}  # per-call cap (s)
      - VOIP_DEFAULT_DAILY_CALL_CAP=${VOIP_DEFAULT_DAILY_CALL_CAP:-50}  # per-agent/day spend bound
      - VOIP_CALL_RATE_LIMIT=${VOIP_CALL_RATE_LIMIT:-5}  # calls per owner+destination window
      - VOIP_CALL_RATE_WINDOW=${VOIP_CALL_RATE_WINDOW:-60}  # rate-limit window (s)
      - VOIP_TICKET_TTL_SECONDS=${VOIP_TICKET_TTL_SECONDS:-180}  # Media Streams WSS ticket TTL
      - VOIP_INTENT_TTL_SECONDS=${VOIP_INTENT_TTL_SECONDS:-180}  # staged Gemini-intent TTL
      - GITHUB_PAT=${GITHUB_PAT:-}
      - HOST_TEMPLATES_PATH=${PWD}/config/agent-templates
      # OpenTelemetry Configuration (Optional)
      - OTEL_ENABLED=${OTEL_ENABLED:-0}
      - OTEL_COLLECTOR_ENDPOINT=${OTEL_COLLECTOR_ENDPOINT:-http://trinity-otel-collector:4317}
      - OTEL_METRICS_EXPORTER=${OTEL_METRICS_EXPORTER:-otlp}
      - OTEL_LOGS_EXPORTER=${OTEL_LOGS_EXPORTER:-otlp}
      - OTEL_EXPORTER_OTLP_PROTOCOL=${OTEL_EXPORTER_OTLP_PROTOCOL:-grpc}
      - OTEL_METRIC_EXPORT_INTERVAL=${OTEL_METRIC_EXPORT_INTERVAL:-60000}
      # Email Service Configuration (for public link verification)
      - EMAIL_PROVIDER=${EMAIL_PROVIDER:-resend}
      - RESEND_API_KEY=${RESEND_API_KEY:-}
      - SMTP_FROM=${SMTP_FROM:-noreply@trinity.example.com}
      # SMTP transport (EMAIL_PROVIDER=smtp) — read by src/backend/config.py (#771)
      - SMTP_HOST=${SMTP_HOST:-}
      - SMTP_PORT=${SMTP_PORT:-587}
      - SMTP_USER=${SMTP_USER:-}
      - SMTP_PASSWORD=${SMTP_PASSWORD:-}
      # SendGrid transport (EMAIL_PROVIDER=sendgrid) — read by src/backend/config.py (#771)
      - SENDGRID_API_KEY=${SENDGRID_API_KEY:-}
      # Log Retention & Archival Configuration
      - LOG_RETENTION_DAYS=${LOG_RETENTION_DAYS:-90}
      - LOG_ARCHIVE_ENABLED=${LOG_ARCHIVE_ENABLED:-true}
      - LOG_CLEANUP_HOUR=${LOG_CLEANUP_HOUR:-3}
      - LOG_ARCHIVE_PATH=/data/archives  # Local path for archived logs
      - TRINITY_DB_PATH=/data/trinity.db
      # Credential Encryption
      - CREDENTIAL_ENCRYPTION_KEY=${CREDENTIAL_ENCRYPTION_KEY:-}
      # Internal API shared secret (C-003) - for scheduler/agent communication
      - INTERNAL_API_SECRET=${INTERNAL_API_SECRET:-}
      # Public access / OAuth callbacks
      - PUBLIC_CHAT_URL=${PUBLIC_CHAT_URL:-}
      - FRONTEND_URL=${FRONTEND_URL:-}
      # CORS (comma-separated extra origins)
      - EXTRA_CORS_ORIGINS=${EXTRA_CORS_ORIGINS:-}
      # Slack channel adapter
      - SLACK_SIGNING_SECRET=${SLACK_SIGNING_SECRET:-}
      # SSH access host override
      - SSH_HOST=${SSH_HOST:-}
      # Canary invariant harness (CANARY-001 / Issue #411).
      # When 1, services/canary_service.py runs the 5-min watcher loop on
      # staging/dev. Default 0 — production users see no canary activity.
      - CANARY_ENABLED=${CANARY_ENABLED:-0}
      # Slack alert sink for canary green→red transitions. URL is the
      # credential — leaking it lets anyone post to that one channel.
      # Unset = canary cycles run silently (still persists violations).
      - CANARY_SLACK_WEBHOOK_URL=${CANARY_SLACK_WEBHOOK_URL:-}
      # Backend agent-call budget (#904 RC-1). Caps how many concurrent
      # outbound agent HTTP calls the backend will hold at once, plus
      # the per-acquire queue wait before returning 503. Default
      # timeout (3600s) matches the platform max execution_timeout —
      # any call that would have eventually succeeded pre-#904 still
      # succeeds; the cap exists to break deadlocks on deep
      # agent-to-agent chains (chain depth > global cap), not to fail
      # short-tail calls. Set queue timeout to 0 to disable (wait
      # forever — accepts deadlock risk for zero false 503s).
      - BACKEND_AGENT_CALL_LIMIT=${BACKEND_AGENT_CALL_LIMIT:-8}
      - BACKEND_AGENT_CALL_QUEUE_TIMEOUT_S=${BACKEND_AGENT_CALL_QUEUE_TIMEOUT_S:-3600}
      # Issue #874: uvicorn --reload writes __pycache__/ next to source files
      # in the ./src/backend bind mount. On Linux dev hosts the host dir is
      # owned by the developer UID, not container UID 1000, so the writes
      # fail. macOS Docker Desktop masks the failure; set this for parity.
      - PYTHONDONTWRITEBYTECODE=1
      # #847 — force OSS-only mode even when the enterprise submodule
      # is mounted. With this set, EntitlementService.is_entitled()
      # returns False for every feature, and the /api/settings/feature-flags
      # `enterprise_features` list is empty. Default (unset = 0): allow
      # entitlements based on license (Phase 0 stub: all-entitled).
      - TRINITY_OSS_ONLY=${TRINITY_OSS_ONLY:-0}
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - ./src/backend:/app
      - ./VERSION:/app/VERSION:ro  # Platform version for agent container labels
      - ./config/agent-templates:/agent-configs/templates:ro
      - ./config/process-templates:/agent-configs/process-templates:ro  # Process templates
      - ./config/hooks:/config/hooks:ro  # Read-only hooks (e.g., read-only-guard.py)
      - ./config/process-docs:/app/config/process-docs:ro  # Process documentation
      - agent-configs:/agent-configs
      - trinity-data:/data
      - trinity-logs:/data/logs:ro  # Read-only access to Vector logs
      - trinity-archives:/data/archives  # Log archives storage
    depends_on:
      redis:
        condition: service_healthy  # Wait for ACL load + auth-aware healthcheck
      vector:
        condition: service_healthy  # Ensure Vector is ready before backend starts agents
    networks:
      - trinity-platform  # Redis, scheduler, vector, OTel
      - trinity-agent  # Frontend, mcp-server, agent containers
    security_opt:
      - no-new-privileges:true
    cap_drop:
      - ALL
    # No cap_add: backend listens on 8000 (>=1024, no NET_BIND_SERVICE needed)
    # and reaches /var/run/docker.sock via group_add below — not capabilities.
    # Issue #874: backend runs as UID 1000 (non-root) but still needs to talk
    # to /var/run/docker.sock. On Linux the socket is group-owned by `docker`
    # (typically GID 999 on Debian/Ubuntu, ~990 on RHEL family). macOS Docker
    # Desktop ignores group_add; the default is safe there.
    group_add:
      - "${DOCKER_GID:-999}"
    read_only: false
    tmpfs:
      - /tmp:noexec,nosuid,size=100m
    command: uvicorn main:app --host 0.0.0.0 --port 8000 --reload --no-server-header
    mem_limit: 4g
    cpus: 4

  # Frontend. Published on host port FRONTEND_PORT (default 80); the source
  # tree is bind-mounted at /app.
  frontend:
    build:
      context: ./src/frontend
      dockerfile: ../../docker/frontend/Dockerfile
    image: trinity-frontend:latest
    container_name: trinity-frontend
    ports:
      - "${FRONTEND_PORT:-80}:80"
    volumes:
      - ./src/frontend:/app
      # Anonymous volume mounted over /app/node_modules inside the bind mount.
      - /app/node_modules
    environment:
      - VITE_API_URL=http://localhost:8000
      - DOCKER_ENV=true
    depends_on:
      - backend
    networks:
      - trinity-agent  # Frontend talks to backend on agent network
    security_opt:
      - no-new-privileges:true
    cap_drop:
      - ALL
    mem_limit: 2g
    cpus: 2

  # Redis with ACL users configured entirely through server arguments.
  redis:
    image: redis:7-alpine
    container_name: trinity-redis
    # Loopback-only host port (test suites connect from the dev machine).
    # External LAN cannot reach Redis even with auth — defense in depth.
    ports:
      - "127.0.0.1:6379:6379"
    volumes:
      - redis-data:/data
    networks:
      - trinity-platform  # Issue #589: Redis is NOT on agent network
    environment:
      # Two passwords by design: leaking the runtime password from a
      # compromised platform container does NOT grant admin (FLUSHALL,
      # CONFIG, SHUTDOWN). See docs/migrations/REDIS_AUTH.md.
      - "REDIS_PASSWORD=${REDIS_PASSWORD:?REDIS_PASSWORD must be set in .env (generate openssl rand -hex 24)}"
      - "REDIS_BACKEND_PASSWORD=${REDIS_BACKEND_PASSWORD:?REDIS_BACKEND_PASSWORD must be set in .env (generate openssl rand -hex 24)}"
    # Additive ACL — start from zero, allow only what the runtime needs.
    # `+@all -X` is rejected: it includes commands Redis hasn't shipped yet.
    # NB: each `--user` rule must be its own argv (Redis CLI parser).
    command:
      - redis-server
      - --appendonly
      - "yes"
      - --requirepass
      # Quoted like the other interpolated arguments below.
      - "${REDIS_PASSWORD}"
      # default — admin user, full access (used for ACL bootstrap and ad-hoc ops)
      - --user
      - default
      - "on"
      - ">${REDIS_PASSWORD}"
      - "~*"
      - "&*"
      - "+@all"
      # backend — runtime user for backend container
      - --user
      - backend
      - "on"
      - ">${REDIS_BACKEND_PASSWORD}"
      - "~*"
      - "&*"
      - "+@read"
      - "+@write"
      - "+@connection"
      - "+@keyspace"
      - "+@stream"
      - "+@list"
      - "+@set"
      - "+@sortedset"
      - "+@hash"
      - "+@scripting"
      - "+@transaction"
      - "+@pubsub"
      - "-@dangerous"
      # scheduler — same access pattern as backend; separate user for
      # traceability. Shares REDIS_BACKEND_PASSWORD with the backend user.
      - --user
      - scheduler
      - "on"
      - ">${REDIS_BACKEND_PASSWORD}"
      - "~*"
      - "&*"
      - "+@read"
      - "+@write"
      - "+@connection"
      - "+@keyspace"
      - "+@stream"
      - "+@list"
      - "+@set"
      - "+@sortedset"
      - "+@hash"
      - "+@scripting"
      - "+@transaction"
      - "+@pubsub"
      - "-@dangerous"
    # Healthcheck pings as the `backend` ACL user so a typo'd ACL keeps
    # Redis unhealthy and gates dependent services (broken-ACL caught at
    # boot, not after first user-facing failure).
    # Password passed via REDISCLI_AUTH env var (not -a flag) so it does
    # not appear in /proc/<pid>/cmdline during the healthcheck (CSO OBS-3).
    # `$$` is the compose escape for a literal `$`, so the variable is
    # expanded by the container shell rather than by compose.
    healthcheck:
      test:
        - CMD-SHELL
        - REDISCLI_AUTH="$$REDIS_BACKEND_PASSWORD" redis-cli --user backend --no-auth-warning ping | grep -q PONG
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 5s

  # MCP server. Authenticates to the backend at http://backend:8000.
  mcp-server:
    build:
      context: ./src/mcp-server
      dockerfile: Dockerfile
    container_name: trinity-mcp-server
    ports:
      - "8080:8080"
    environment:
      - NODE_ENV=production
      - MCP_PORT=8080
      - TRINITY_API_URL=http://backend:8000
      - TRINITY_USERNAME=${TRINITY_USERNAME:-admin}
      # #914: ceiling for the synchronous backend fetch in chat_with_agent
      # sync mode. On abort, the MCP server queries /executions and returns
      # a structured `queued_timeout` receipt instead of `fetch failed`.
      # Default 25000ms sits below the typical 30-60s MCP gateway ceiling.
      - MCP_CHAT_TIMEOUT_MS=${MCP_CHAT_TIMEOUT_MS:-25000}
      # MCP server authenticates to backend as ADMIN_USERNAME/ADMIN_PASSWORD.
      # Read directly from ADMIN_PASSWORD so devs do not have to set the same
      # secret twice (issue #692).
      - TRINITY_PASSWORD=${ADMIN_PASSWORD:-}
      - MCP_REQUIRE_API_KEY=true
      - INTERNAL_API_SECRET=${INTERNAL_API_SECRET:-}  # SEC-001: for audit logging to backend
    depends_on:
      - backend
    networks:
      # MCP server straddles both networks: agents call http://mcp-server:8080/mcp
      # via Docker DNS on the agent network; backend reaches it on platform network.
      - trinity-platform
      - trinity-agent
    security_opt:
      - no-new-privileges:true
    cap_drop:
      - ALL
    # No cap_add: the MCP server listens on 8080 (>=1024), so
    # NET_BIND_SERVICE is not needed — same reasoning as the backend.
    restart: unless-stopped

  # Scheduler - dedicated service for cron-based agent task execution
  # Prevents duplicate executions that occurred with multiple backend workers
  scheduler:
    build:
      context: .
      dockerfile: docker/scheduler/Dockerfile
    container_name: trinity-scheduler
    restart: unless-stopped
    environment:
      - DATABASE_PATH=/data/trinity.db
      # Scheduler authenticates as its own `scheduler` ACL user, which the
      # redis service defines with the same password and grants as `backend`,
      # so its connections can be told apart from the backend's.
      - "REDIS_URL=redis://scheduler:${REDIS_BACKEND_PASSWORD:?REDIS_BACKEND_PASSWORD must be set in .env}@redis:6379"
      - HEALTH_PORT=8001
      - LOG_LEVEL=${SCHEDULER_LOG_LEVEL:-INFO}
      - LOCK_TIMEOUT=600
      - AGENT_TIMEOUT=900
      - MISFIRE_GRACE_TIME=3600
      - PUBLISH_EVENTS=true
      - INTERNAL_API_SECRET=${INTERNAL_API_SECRET:-}
    volumes:
      - trinity-data:/data
    depends_on:
      redis:
        condition: service_healthy  # Wait for ACL load + auth-aware healthcheck
      backend:
        condition: service_started
    networks:
      - trinity-platform  # Scheduler talks only to Redis + backend; never to agents
    security_opt:
      - no-new-privileges:true
    cap_drop:
      - ALL
    # Probes the health endpoint on HEALTH_PORT (8001) from inside the container.
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8001/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 10s
    mem_limit: 512m
    cpus: 1

  # Vector - centralized log aggregation from all containers
  vector:
    image: timberio/vector:0.43.1-alpine
    container_name: trinity-vector
    restart: unless-stopped
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - ./config/vector.yaml:/etc/vector/vector.yaml:ro
      - trinity-logs:/data/logs
    ports:
      - "8686:8686"  # Vector API (health checks)
    networks:
      - trinity-platform  # Vector reads via Docker socket, not network — platform side
    security_opt:
      - no-new-privileges:true
    cap_drop:
      - ALL
    healthcheck:
      test: ["CMD", "wget", "-q", "-O", "-", "http://127.0.0.1:8686/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 10s  # Give Vector time to start before health checks

  # OpenTelemetry Collector - receives metrics from Claude Code agents
  # Using contrib image for deltatocumulative processor (required for Prometheus export)
  otel-collector:
    image: otel/opentelemetry-collector-contrib:0.120.0
    container_name: trinity-otel-collector
    restart: unless-stopped
    ports:
      - "4317:4317"  # gRPC receiver (OTLP)
      - "4318:4318"  # HTTP receiver (OTLP)
      - "8889:8889"  # Prometheus metrics exporter
      - "13133:13133"  # Health check endpoint
    volumes:
      - ./config/otel-collector.yaml:/etc/otelcol-contrib/config.yaml:ro
    networks:
      - trinity-platform
      - trinity-agent  # Agents push metrics directly to the collector
    labels:
      - "trinity.platform=infrastructure"
      - "trinity.service=otel-collector"
    security_opt:
      - no-new-privileges:true
    cap_drop:
      - ALL
# Named volumes. None sets a driver or options, so each uses the defaults.
volumes:
  # Mounted by the backend at /agent-configs.
  agent-configs: {}
  # Mounted by redis at /data.
  redis-data: {}
  # Shared by the backend and the scheduler at /data.
  trinity-data: {}
  # Vector log storage (backend mounts it read-only at /data/logs).
  trinity-logs: {}
  # Compressed log archives (backend mounts it at /data/archives).
  trinity-archives: {}
networks:
  # Platform-side bridge for Redis, scheduler, backend, mcp-server, vector
  # and otel. Agents must NEVER be attached here — Issue #589 (Redis lockdown).
  trinity-platform:
    name: trinity-platform-network
    driver: bridge
    ipam:
      config:
        - subnet: 172.29.0.0/16

  # Agent-side bridge for agents and the frontend, plus the services that
  # have to reach them (backend, mcp-server, otel-collector). The network
  # name is kept as-is so the three agent-creation sites in
  # services/agent_service/* and system_agent_service.py need zero code changes.
  trinity-agent:
    name: trinity-agent-network
    driver: bridge
    ipam:
      config:
        - subnet: 172.28.0.0/16