diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index d4a2e3c4..941deb52 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -40,6 +40,10 @@ services: - SLACK_CLIENT_ID=${SLACK_CLIENT_ID} - SLACK_CLIENT_SECRET=${SLACK_CLIENT_SECRET} - SLACK_SIGNING_SECRET=${SLACK_SIGNING_SECRET:-} + # Socket Mode concurrent WebSocket count (#244); clamped to [1,10], default 2. + # Mirrors docker-compose.yml — without this line the .env knob is inert in + # prod (the #1039 packaging-gap class). + - SLACK_SOCKET_CONNECTION_COUNT=${SLACK_SOCKET_CONNECTION_COUNT:-2} - CREDENTIAL_ENCRYPTION_KEY=${CREDENTIAL_ENCRYPTION_KEY:-} - EMAIL_PROVIDER=${EMAIL_PROVIDER:-resend} - RESEND_API_KEY=${RESEND_API_KEY:-} @@ -93,6 +97,14 @@ services: - PUBLIC_CHAT_URL=${PUBLIC_CHAT_URL:-} # Frontend URL for post-OAuth redirects - FRONTEND_URL=${FRONTEND_URL:-} + # Backend's own public origin — used to build OAuth redirect URIs + # ({BACKEND_URL}/api/oauth/{provider}/callback, read by config.py). Mirrors + # docker-compose.yml; without this line the .env value is inert in prod (the + # #1039 packaging-gap class) and OAuth callbacks fall back to localhost — set + # this to your public backend URL. The scheduler reads BACKEND_URL too but + # keeps its own internal default (http://backend:8000); intentionally not + # wired here so it still reaches the backend over the Docker network. + - BACKEND_URL=${BACKEND_URL:-http://localhost:8000} # Internal API shared secret (C-003) - for scheduler/agent communication - INTERNAL_API_SECRET=${INTERNAL_API_SECRET} # Configurable database backend (#300, experimental) — mirrors @@ -103,6 +115,12 @@ services: - DATABASE_URL=${DATABASE_URL:-} - DB_POOL_SIZE=${DB_POOL_SIZE:-10} - DB_MAX_OVERFLOW=${DB_MAX_OVERFLOW:-20} + # Host telemetry (/api/telemetry) — container-stats cache freshness (s) and + # max concurrent Docker stat fetches per refresh. Mirrors docker-compose.yml; + # without these lines the .env knobs are inert in prod (the #1039 + # packaging-gap class). + - TELEMETRY_CONTAINER_STATS_TTL=${TELEMETRY_CONTAINER_STATS_TTL:-10} + - TELEMETRY_DOCKER_POOL_SIZE=${TELEMETRY_DOCKER_POOL_SIZE:-16} volumes: - /var/run/docker.sock:/var/run/docker.sock:ro - ./config/agent-templates:/agent-configs/templates:ro @@ -279,6 +297,9 @@ services: - LOG_LEVEL=${SCHEDULER_LOG_LEVEL:-INFO} - LOCK_TIMEOUT=600 - AGENT_TIMEOUT=900 + # #1022: scheduler→backend dispatch + pre-check deadlines (operator-tunable). + - DISPATCH_TIMEOUT=${DISPATCH_TIMEOUT:-30} + - PRE_CHECK_TIMEOUT=${PRE_CHECK_TIMEOUT:-70} - MISFIRE_GRACE_TIME=3600 - PUBLISH_EVENTS=true - INTERNAL_API_SECRET=${INTERNAL_API_SECRET} diff --git a/docker-compose.yml b/docker-compose.yml index 5258af7b..9c6a26b7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -83,12 +83,26 @@ services: # Public access / OAuth callbacks - PUBLIC_CHAT_URL=${PUBLIC_CHAT_URL:-} - FRONTEND_URL=${FRONTEND_URL:-} + # Backend's own public origin — used to build OAuth redirect URIs + # ({BACKEND_URL}/api/oauth/{provider}/callback, read by config.py). Without + # this line the .env value is inert (the #1039 packaging-gap class) and + # callbacks fall back to the localhost default. The scheduler reads BACKEND_URL + # too but keeps its own internal default (http://backend:8000); intentionally + # not wired here so it still reaches the backend over the Docker network. + - BACKEND_URL=${BACKEND_URL:-http://localhost:8000} # CORS (comma-separated extra origins) - EXTRA_CORS_ORIGINS=${EXTRA_CORS_ORIGINS:-} # Slack channel adapter - SLACK_SIGNING_SECRET=${SLACK_SIGNING_SECRET:-} + # Socket Mode concurrent WebSocket count (#244); clamped to [1,10], default 2 + - SLACK_SOCKET_CONNECTION_COUNT=${SLACK_SOCKET_CONNECTION_COUNT:-2} # SSH access host override - SSH_HOST=${SSH_HOST:-} + # Host telemetry (/api/telemetry) — container-stats cache freshness (s) and + # max concurrent Docker stat fetches per refresh. Without these lines the + # .env knobs are inert (the #1039 packaging-gap class). + - TELEMETRY_CONTAINER_STATS_TTL=${TELEMETRY_CONTAINER_STATS_TTL:-10} + - TELEMETRY_DOCKER_POOL_SIZE=${TELEMETRY_DOCKER_POOL_SIZE:-16} # Canary invariant harness (CANARY-001 / Issue #411). # When 1, services/canary_service.py runs the 5-min watcher loop on # staging/dev. Default 0 — production users see no canary activity. @@ -328,6 +342,9 @@ services: - LOG_LEVEL=${SCHEDULER_LOG_LEVEL:-INFO} - LOCK_TIMEOUT=600 - AGENT_TIMEOUT=900 + # #1022: scheduler→backend dispatch + pre-check deadlines (operator-tunable). + - DISPATCH_TIMEOUT=${DISPATCH_TIMEOUT:-30} + - PRE_CHECK_TIMEOUT=${PRE_CHECK_TIMEOUT:-70} - MISFIRE_GRACE_TIME=3600 - PUBLISH_EVENTS=true - INTERNAL_API_SECRET=${INTERNAL_API_SECRET:-} diff --git a/docs/memory/feature-flows.md b/docs/memory/feature-flows.md index f25d1fe9..27c8b2d4 100644 --- a/docs/memory/feature-flows.md +++ b/docs/memory/feature-flows.md @@ -11,6 +11,7 @@ | Date | ID | Feature | Flow | |------|-----|---------|------| +| 2026-06-14 | #1022 | fix(scheduler): persist a descriptive `error` on dispatch timeout — a dispatch `httpx.TimeoutException` (whose `str()` is `''`) previously landed in the cron path's generic handler and persisted a **blank** `error`. Now re-raised before that handler as a named non-blank message (`"dispatch to /api/internal/execute-task timed out after {N}s — outcome unknown"`); outcome is genuinely UNKNOWN (backend spawns the bg task before replying → may already be running → orphan recovered by cleanup). New `_describe_exception()` helper (type-name fallback) normalizes any blank-stringifying exception across all execution/retry/process-schedule error paths. Dispatch + pre-check HTTP deadlines lifted from literals to config: `DISPATCH_TIMEOUT` (default 30s) and `PRE_CHECK_TIMEOUT` (default 70s). Scheduler-only (`src/scheduler/`); +270 lines of tests (incl. pre-check config-deadline + retry-path blank-error regressions). | [scheduler-service.md](feature-flows/scheduler-service.md), [scheduler-pre-check.md](feature-flows/scheduler-pre-check.md) | | 2026-06-11 | #858 | fix: first-time setup token silently lost — `docker/backend/Dockerfile` had drifted and lost `ENV PYTHONUNBUFFERED=1` (which `docker/scheduler/Dockerfile` still set), so CPython block-buffered the lifespan's stdout to the Docker log pipe (~8KB) and the printed setup token never reached `docker logs`, deadlocking fresh installs (the only documented path through the `routers/setup.py` token gate). Two-layer fix: (1) restore `PYTHONUNBUFFERED=1` (catches every `print()`); (2) the setup-token block + ~76 other lifespan `print()` calls now emit via the structured `logger` — the token as a single multi-line `logger.warning` **relocated to immediately after `setup_logging()`**, before the event-bus/audit-write startup that could otherwise hang and suppress it (the `StreamHandler` flushes per record, so it's immune to future Dockerfile drift and flows through Vector). `setup_opentelemetry()`'s import-time print + the `register_enterprise` prints stay `print(..., flush=True)` (they run before `setup_logging()`). New `unit/test_858_dockerfile_unbuffered.py` backend↔scheduler parity guard (2 tests). Note: stdout→stderr stream move for the converted lines (Docker/Vector capture both). Known follow-up #1165: prod runs uvicorn `--workers 2`, so the per-process token is still ~50% flaky until unified. | [first-time-setup.md](feature-flows/first-time-setup.md) | | 2026-06-10 | #1130 | fix: retired `gemini-2.0-flash` replaced with env-configurable models — `GEMINI_TEXT_MODEL` (image-gen prompt refinement) + `GEMINI_TRANSCRIPTION_MODEL` (Telegram voice), both default `gemini-3.5-flash`, defined in `config.py`, empty-string-safe wiring in both compose files (#1076 pattern). | [image-generation.md](feature-flows/image-generation.md), [telegram-integration.md](feature-flows/telegram-integration.md) | | 2026-06-10 | #1108 | feat(ui): Agent Detail **Guardrails** tab renamed to **Settings** — sectioned config home. New `components/settings/SettingsPanel.vue` renders `GuardrailsPanel` unchanged as section #1; future per-agent settings land as additive sections, not new tabs. `?tab=guardrails` deep links alias to `settings` via `TAB_ALIASES`. Pure frontend. | [agent-guardrails.md](feature-flows/agent-guardrails.md) | diff --git a/docs/memory/feature-flows/scheduler-pre-check.md b/docs/memory/feature-flows/scheduler-pre-check.md index f941e20e..846bcfb7 100644 --- a/docs/memory/feature-flows/scheduler-pre-check.md +++ b/docs/memory/feature-flows/scheduler-pre-check.md @@ -33,7 +33,7 @@ Returns: ## Scheduler Layer **Service**: `src/scheduler/service.py` — `_run_pre_check(agent_name)` -Calls the backend's internal endpoint (not the agent directly — topology stays "scheduler → backend → agent"). Translates the backend response into a scheduler decision: +Calls the backend's internal endpoint (not the agent directly — topology stays "scheduler → backend → agent"). The scheduler→backend HTTP call uses `config.pre_check_timeout` (env `PRE_CHECK_TIMEOUT`, default 70s — the agent-side hook deadline is 60s, this adds headroom; lifted from a literal in #1022). A timeout fails open (`None` → fire as usual) and is logged with the exception **type** rather than the empty parens a bare `httpx` timeout would otherwise print (#1022). Translates the backend response into a scheduler decision: | Backend response | Scheduler decision | |---|---| diff --git a/docs/memory/feature-flows/scheduler-service.md b/docs/memory/feature-flows/scheduler-service.md index fdd9b250..4a715942 100644 --- a/docs/memory/feature-flows/scheduler-service.md +++ b/docs/memory/feature-flows/scheduler-service.md @@ -107,6 +107,8 @@ As a **platform administrator**, I want **scheduled tasks to execute exactly onc | `SCHEDULE_RELOAD_INTERVAL` | `60` | Seconds between schedule sync checks | | `AGENT_TIMEOUT` | `900` | Default agent request timeout (15 min) | | `POLL_INTERVAL` | `10` | Seconds between DB polls for async task completion (SCHED-ASYNC-001) | +| `DISPATCH_TIMEOUT` | `30` | HTTP deadline for the scheduler→backend `POST /api/internal/execute-task` round-trip (dispatch only; the async endpoint returns ~instantly). Reaching it means the backend did not respond — outcome is **UNKNOWN**, not "rejected" (#1022) | +| `PRE_CHECK_TIMEOUT` | `70` | HTTP deadline for the scheduler→backend pre-check call (agent-side hook is 60s; 10s headroom). Fail-open (#1022) | | `MISFIRE_GRACE_TIME` | `3600` | Seconds after a missed trigger that APScheduler will still execute (Issue #145) | | `BACKEND_URL` | `http://backend:8000` | Backend API URL for process executions and task delegation | | `INTERNAL_API_SECRET` | _(empty)_ | Shared secret for backend internal API auth (C-003) | @@ -293,13 +295,15 @@ async def _execute_schedule_with_lock(self, schedule_id: str, triggered_by: str self.db.update_execution_status(execution_id=execution.id, status=ExecutionStatus.FAILED, ...) ``` -**Backend Task Delegation** (`service.py:760-833`): +**Backend Task Delegation** (`_call_backend_execute_task`): Uses async fire-and-forget dispatch (SCHED-ASYNC-001): -1. `POST /api/internal/execute-task` with `async_mode=True` and 30s HTTP timeout +1. `POST /api/internal/execute-task` with `async_mode=True` and the configured dispatch deadline (`config.dispatch_timeout`, env `DISPATCH_TIMEOUT`, default 30s — was a literal before #1022) 2. If backend accepts (`{"status": "accepted", "async_mode": true}`), poll DB 3. Backward compatible: if backend returns sync result, use directly +A dispatch `httpx.TimeoutException` is re-raised as a **named, non-blank** error (`"dispatch to /api/internal/execute-task timed out after {N}s ({TimeoutType}) — outcome unknown"`) *before* it can reach the generic `except` handler. The outcome is genuinely unknown: the backend spawns the background task before replying (`internal.py`), so a dispatch timeout may mean the task is already running and will surface as an orphan recovered by the cleanup service. This closes the #1022 silent-failure — bare `httpx` timeouts stringify to `''`, which previously persisted as a blank `error`. Defense-in-depth: `_describe_exception()` normalizes any other blank-stringifying exception (falls back to the type name) across every execution, retry, and process-schedule error path. + **DB Polling** (`service.py:835-887`): ```python async def _poll_execution_completion(self, execution_id, timeout_seconds): @@ -954,7 +958,7 @@ typing-extensions>=4.9.0 | `test_locking.py` | Distributed locks | Redis lock acquire/release | | `test_agent_client.py` | HTTP client | Agent communication | | `test_service.py` | Scheduler service | Full integration tests | -| `test_async_dispatch.py` | Async dispatch + polling | SCHED-ASYNC-001 (11 tests) | +| `test_async_dispatch.py` | Async dispatch + polling | SCHED-ASYNC-001 + dispatch-timeout error persistence (#1022): `_describe_exception()` branches, descriptive-raise on `ReadTimeout('')`, config-driven deadline, non-empty-error regression for both the main dispatch and the retry path (22 tests) | | `conftest.py` | Fixtures | Mock database, Redis, models | ### Running Tests @@ -985,6 +989,8 @@ docker compose -f docker/scheduler/docker-compose.test.yml up | Agent timeout | Update status=failed (with overwrite guard), publish event | Error recorded | | Auth failure detected | Log auth-specific error, publish event | Error recorded with auth context | | **TCP disconnect (SCHED-ASYNC-001)** | **Check DB before overwriting -- if backend already finalized, preserve status** | **No false failures** | +| **Dispatch timeout (#1022)** | Re-raised as a named non-blank error *before* the generic handler; outcome UNKNOWN (task may already be running → orphan) | Descriptive `error` persisted (never blank); cleanup service recovers any orphan | +| **Blank-stringifying exception (e.g. httpx timeout)** | `_describe_exception()` falls back to the exception type name | `error` never persisted blank (#1022) | | **Polling deadline exceeded** | Raise exception, overwrite guard checks DB status | Error recorded (if genuinely stale) | | Process backend HTTP error | Update process execution as failed, publish event | Error recorded | | Process backend timeout | Update process execution as failed, publish event | Error recorded | @@ -1078,6 +1084,7 @@ The embedded scheduler (`src/backend/services/scheduler_service.py`) has been co | Date | Change | |------|--------| +| 2026-06-14 | **Descriptive error on dispatch timeout (#1022)**: a dispatch `httpx.TimeoutException` is re-raised with a named, non-blank message (`"dispatch ... timed out after {N}s — outcome unknown"`) so the silent blank `error` (bare httpx timeouts `str()` to `''`) no longer lands. New `_describe_exception()` helper normalizes blank exceptions across all execution / retry / process-schedule error paths. Dispatch + pre-check HTTP deadlines lifted from literals to config: `DISPATCH_TIMEOUT` (default 30s) and `PRE_CHECK_TIMEOUT` (default 70s). Outcome of a dispatch timeout is UNKNOWN (backend may have already started the bg task → orphan recovered by cleanup). | | 2026-05-09 | **MCP update_agent_schedule fixes (aaad4f6, #741/#742)**: (1) `src/mcp-server/src/tools/schedules.ts` — added explicit warning to `enabled` field description in `update_agent_schedule` so AI models do not inadvertently re-enable a disabled schedule when updating unrelated fields (e.g. cron expression). (2) `src/backend/routers/schedules.py` — added `max_retries` and `retry_delay_seconds` to `ScheduleUpdateRequest`; both were missing, so `exclude_unset=True` silently dropped them before they reached `db.update_schedule()`, making retry config uneditable via MCP. | | 2026-04-23 | **Retry Default Flipped (#476)**: `max_retries` default `1 → 0`. Both new and existing schedules are opt-in now. Scheduled agents typically catch up on next tick; retries amplified load during multi-hour outages. | | 2026-04-14 | **Automatic Retry (RETRY-001)**: Added Flow 10 documenting configurable retry mechanism for failed executions. New fields: max_retries, retry_delay_seconds, attempt_number, retry_of_execution_id, retry_scheduled_at. New status: pending_retry. | diff --git a/docs/reports/config-2026-06-14.md b/docs/reports/config-2026-06-14.md new file mode 100644 index 00000000..0a973d98 --- /dev/null +++ b/docs/reports/config-2026-06-14.md @@ -0,0 +1,86 @@ +# Config Validation Report + +**Date**: 2026-06-14 +**Branch**: AndriiPasternak31/plan-issue-1022 +**Sources checked**: `.env.example`, `docker-compose.yml`, `docker-compose.prod.yml`, `src/backend/`, `src/scheduler/`, `src/mcp-server/` + +## Summary + +| Check | Status | Count | +|-------|--------|-------| +| Undocumented **required** vars (no default, secret/wiring) | ✅ | 0 | +| Documented in `.env.example` but **not wired into any container** | ✅ resolved | 0 (was 4) | +| Compose-tunable knobs (all `:-default`) absent from `.env.example` | ⚠️ | ~18 | +| Code-only knobs (code default) absent from `.env.example` | ⚠️ | ~24 | +| Truly stale `.env.example` entries | ✅ | 0 | +| False positives excluded (build-args / internal wiring) | — | 13 | + +**Headline**: No security- or boot-critical drift. Every compose `${VAR}` reference carries an inline `:-default`, and the two mandatory secrets use loud `:?` guards (`REDIS_BACKEND_PASSWORD:?...`) — nothing fails silently. The one **functional** finding is a wiring gap: compose declares no `env_file:`, so four vars documented in `.env.example` never reach the backend container. + +--- + +## Findings + +### ✅ RESOLVED (2026-06-14) — Documented vars not forwarded to any container (functional drift) + +**Fix:** wired all four vars into the **backend** service `environment:` block in both `docker-compose.yml` and `docker-compose.prod.yml`, each with a `:-` that matches both the code default and the `.env.example` value (behavior unchanged when unset). `BACKEND_URL` was wired into the backend **only** — the scheduler also reads it but with a different internal default (`http://backend:8000`), so injecting the backend's localhost default there would have broken the scheduler→backend hop. Both compose files validate (`docker compose config -q` → exit 0). + +--- + +`docker-compose.yml` / `docker-compose.prod.yml` use **no `env_file:` directive** — they only interpolate `${VAR}` for substitution and pass through what each service's `environment:` block explicitly lists. The following were declared in `.env.example` (implying "set me here") and **read by backend `config.py`**, but appeared in **no** compose `environment:` block, so an operator setting them in `.env` had **no effect** on the running container: + +| Variable | Read by | `.env.example` | In compose env block | +|----------|---------|:---:|:---:| +| `BACKEND_URL` | backend config | ✅ | ❌ | +| `SLACK_SOCKET_CONNECTION_COUNT` | slack socket transport (default 2) | ✅ | ❌ | +| `TELEMETRY_CONTAINER_STATS_TTL` | telemetry service | ✅ | ❌ | +| `TELEMETRY_DOCKER_POOL_SIZE` | telemetry service | ✅ | ❌ | + +**Fix** (pick one per var): +- If meant to be operator-tunable → add `- VAR=${VAR:-}` to the `backend` service `environment:` block in both compose files (matches the existing `BACKEND_AGENT_CALL_LIMIT=${...:-8}` pattern). +- If not intended for container override → remove from `.env.example` so the contract isn't misleading. + +Given all four have live code readers, **wiring them through** is the correct fix. + +### INFORMATIONAL — Compose knobs not surfaced in `.env.example` + +Referenced in compose with a safe inline default, so operator-optional — but invisible to anyone reading `.env.example`: + +- **Concurrency / timeouts**: `BACKEND_AGENT_CALL_LIMIT` (8), `BACKEND_AGENT_CALL_QUEUE_TIMEOUT_S` (3600), `DISPATCH_TIMEOUT` (30), `PRE_CHECK_TIMEOUT` (70), `MCP_CHAT_TIMEOUT_MS` (25000), `MCP_REQUIRE_API_KEY` (true) +- **Alt DB backend (#300, defaults to SQLite)**: `DATABASE_URL` (empty), `DB_POOL_SIZE` (10), `DB_MAX_OVERFLOW` (20), `POSTGRES_DB` (trinity), `POSTGRES_USER` (trinity), `POSTGRES_PASSWORD` (**default `trinity`**) +- **Model overrides**: `GEMINI_TEXT_MODEL`, `GEMINI_TRANSCRIPTION_MODEL`, `VOICE_MODEL` +- **Misc**: `SCHEDULER_LOG_LEVEL` (INFO), `TRINITY_OSS_ONLY` (0), `TRINITY_USERNAME` (admin) + +> Note: `POSTGRES_PASSWORD` ships a dev default of `trinity`. The Postgres path is opt-in (`DATABASE_URL` empty → SQLite), but if an operator enables it without overriding, they inherit a weak default with no `.env.example` prompt. Worth a documented `POSTGRES_PASSWORD=` line if the #300 path is supported. + +### INFORMATIONAL — Code-only knobs (no compose ref, no `.env.example`) + +Read by backend with a code default; discoverable only by reading source. Several are documented in `architecture.md` but not in `.env.example`: + +- **Retention / maintenance**: `AUDIT_LOG_RETENTION_DAYS`, `AUDIT_RETENTION_ENABLED`, `AUDIT_RETENTION_HOUR`, `DB_VACUUM_ENABLED`, `DB_VACUUM_HOUR`, `DB_VACUUM_MINUTE`, `LOG_ARCHIVE_PATH` +- **Reliability / feature flags**: `DISPATCH_BREAKER_ENABLED`, `SESSION_TAB_ENABLED`, `WEBHOOK_RATE_LIMIT`, `REDIS_STREAM_MAXLEN` +- **Agent readiness / cache**: `AGENT_READINESS_POLL_INTERVAL_S`, `AGENT_READINESS_TIMEOUT_S`, `AGENT_STATS_CACHE_TTL_SECONDS` +- **SSH access**: `SSH_ACCESS_CLEANUP_INTERVAL`, `SSH_ACCESS_DEFAULT_TTL_HOURS`, `SSH_ACCESS_MAX_TTL_HOURS` +- **OTel**: `OTEL_SAMPLE_RATE`, `OTEL_EXPORTER_OTLP_ENDPOINT`, `OTEL_PROMETHEUS_ENDPOINT` +- **Slack**: `SLACK_APP_TOKEN`, `SLACK_AUTO_VERIFY_EMAIL`, `SLACK_TRANSPORT_MODE` +- **Auth / security**: `EMAIL_AUTH_ENABLED`, `TRUSTED_PROXIES`, `ENVIRONMENT` +- **Voice**: `VOICE_MAX_DURATION` + +These are intentionally code-driven and low-priority; surface the highest-value flags (`DISPATCH_BREAKER_ENABLED`, `SESSION_TAB_ENABLED`, `EMAIL_AUTH_ENABLED`, `TRUSTED_PROXIES`, audit/vacuum schedule) in `.env.example` under a "# Advanced / optional" section if desired. + +### Excluded — false positives (correctly absent from `.env.example`) + +- **Build-time provenance** (Dockerfile `ARG`/`ENV`, set by `start.sh` build args): `BUILD_DATE`, `VERSION`, `GIT_COMMIT`, `GIT_BRANCH`, `GIT_COMMIT_SUBJECT`, `GIT_COMMIT_TIMESTAMP` +- **Internal service wiring** (constructed/set by compose or injected into agent containers at creation, never operator-set): `REDIS_URL` (built from `REDIS_BACKEND_PASSWORD`), `TRINITY_DB_PATH`, `TRINITY_BACKEND_URL`, `TRINITY_MCP_URL`, `SCHEDULER_URL`, `TRINITY_GIT_BASE_URL`, `TRINITY_GIT_API_BASE` +- **Infra vars used by compose interpolation, not Python** (declared in `.env.example`, consumed by compose — *not* stale): `DOCKER_GID`, `FRONTEND_PORT`, `REDIS_BACKEND_PASSWORD`, `REDIS_PASSWORD`, `TRINITY_DATA_PATH`, `TUNNEL_TOKEN` + +--- + +## Recommendation + +**DRIFT DETECTED — documentation, plus one functional wiring gap. No required/critical vars missing.** + +Priority order: +1. ~~**Wire the 4 MEDIUM vars** into the backend `environment:` block in both compose files~~ — **✅ done 2026-06-14** (`BACKEND_URL`, `SLACK_SOCKET_CONNECTION_COUNT`, `TELEMETRY_CONTAINER_STATS_TTL`, `TELEMETRY_DOCKER_POOL_SIZE`). +2. **Document `POSTGRES_PASSWORD`** (and the #300 Postgres path) in `.env.example` so the weak `trinity` default isn't inherited silently. +3. (Optional) Add an "# Advanced / optional" block to `.env.example` for the high-value code-only flags. diff --git a/docs/security-reports/cso-2026-06-13-1022-diff.md b/docs/security-reports/cso-2026-06-13-1022-diff.md new file mode 100644 index 00000000..58bb7e19 --- /dev/null +++ b/docs/security-reports/cso-2026-06-13-1022-diff.md @@ -0,0 +1,73 @@ +# CSO Security Audit + +**Mode**: diff +**Scope**: `AndriiPasternak31/plan-issue-1022` vs `origin/dev` (1 commit, 6 files, +252/−14) +**Date**: 2026-06-13 +**Commit**: `51c18031 fix(scheduler): persist descriptive error on dispatch timeout (#1022)` + +## Summary + +| Category | CRITICAL | HIGH | MEDIUM | LOW | +|----------|:--------:|:----:|:------:|:---:| +| Secrets | 0 | 0 | 0 | 0 | +| Dependencies | 0 | 0 | 0 | 0 | +| Auth Boundaries | 0 | 0 | 0 | 0 | +| Injection | 0 | 0 | 0 | 0 | +| Platform Patterns | 0 | 0 | 0 | 0 | +| Configuration | 0 | 0 | 0 | 0 | + +## Change Overview + +A defensive reliability fix: the scheduler was persisting a **blank** `error` on a +`scheduler→backend` dispatch (or pre-check) timeout because httpx timeout exceptions +`str()` to `''`. The fix adds `_describe_exception()` (falls back to the exception type +name when the message is blank/whitespace), wraps the dispatch POST to raise a +descriptive non-blank error, and lifts two hard-coded timeout literals to +operator-tunable config/env. + +| File | Change | +|------|--------| +| `src/scheduler/config.py` | `dispatch_timeout` (`DISPATCH_TIMEOUT`, default 30s) + `pre_check_timeout` (`PRE_CHECK_TIMEOUT`, default 70s) | +| `src/scheduler/service.py` | `_describe_exception()`; `try/except httpx.TimeoutException` on dispatch POST → descriptive raise; blank-error normalization at 3 persistence sites | +| `docker-compose.yml` / `docker-compose.prod.yml` | Pass new env vars with safe defaults | +| `tests/scheduler_tests/test_async_dispatch.py` | +166 lines (helper, descriptive-raise, config-driven deadline, end-to-end non-empty-error regression) | +| `tests/scheduler_tests/test_config.py` | +15 lines (default + env-override assertions) | + +## Findings + +### CRITICAL +None + +### HIGH +None + +### Evidence (what was checked) + +1. **Secrets** — `+`-line pattern sweep: no API keys/tokens/passwords; no `.env` files + staged. Only credential reference (`INTERNAL_API_SECRET`) is unchanged context. New + env vars are numeric timeouts with safe defaults. +2. **Dependencies** — no new runtime deps; `import httpx` added only in tests (`httpx` + already a scheduler dep). No `requirements.txt`/`package.json` delta. +3. **Auth boundaries** — no routes added/modified; internal scheduler logic around the + existing `/api/internal/execute-task` and `/api/internal/agents/{name}/pre-check` + calls; `X-Internal-Secret` headers untouched. +4. **Injection** — richer `error` strings written only via parameterized SQL + (`UPDATE schedule_executions SET … error = ? …`, bound tuple in `database.py:495`). + No SQL string interpolation; no `subprocess`/`os.system`/`eval`/`exec`; no + path/template/SSRF surface. +5. **Secret-leak via exception text** — `_describe_exception` uses `str(e)` (not + `repr`); dispatch raise interpolates only internal values (timeout float, static + path, `type(e).__name__`). httpx timeout `str()` carries no request headers, so + `INTERNAL_API_SECRET` cannot fold into a persisted error. Non-200 branch persists + `response.text[:500]` (backend body, not headers) — unchanged. +6. **Configuration** — two additive, operator-tunable env vars with secure defaults; no + debug/CORS/cookie/header/permission changes. + +### Non-security note (out of CSO scope) +`DISPATCH_TIMEOUT` / `PRE_CHECK_TIMEOUT` are not in `.env.example` — a config-drift nit +for `/validate-config`, not a vulnerability (defaults baked into both `config.py` and +compose). + +## Recommendation + +**CLEAR** — no security blockers. Safe to merge from a security standpoint. diff --git a/src/scheduler/config.py b/src/scheduler/config.py index 194dd2ce..5a1f83f7 100644 --- a/src/scheduler/config.py +++ b/src/scheduler/config.py @@ -75,6 +75,26 @@ class SchedulerConfig: "POLL_DEADLINE_BUFFER", "60" ))) + # Dispatch deadline for the scheduler→backend POST /api/internal/execute-task + # round-trip. The async endpoint normally returns ~instantly; reaching this + # ceiling means the backend did not RESPOND in time (event-loop stall under + # fan-out). NOTE: outcome is UNKNOWN, not "rejected" — the backend spawns the + # bg task before replying (internal.py), so a timeout may mean the task is + # already running (it then becomes an orphan recovered by the cleanup service; + # see the D4 follow-up). Lifted from a literal (#1022) so the deadline is + # tunable + "dispatch timed out" is distinguishable from "task ran long". + dispatch_timeout: float = field(default_factory=lambda: float(os.getenv( + "DISPATCH_TIMEOUT", "30" + ))) + + # Pre-check hook deadline (agent-side hook is 60s; this is the backend-call ceiling + # with headroom). Same backend stall that times out dispatch also times out the + # pre-check; lifted from the 70.0 literal (#1022, D1) for tunability. Fail-open — + # a pre-check timeout never produces a failed row, it just fires the schedule. + pre_check_timeout: float = field(default_factory=lambda: float(os.getenv( + "PRE_CHECK_TIMEOUT", "70" + ))) + # Misfire grace time — how long after a missed trigger APScheduler will # still execute the job. Default 30s is far too low for weekly cron jobs # whose container may restart. 3600s (1 hour) gives ample runway. diff --git a/src/scheduler/service.py b/src/scheduler/service.py index b849159d..3063fe4f 100644 --- a/src/scheduler/service.py +++ b/src/scheduler/service.py @@ -81,6 +81,13 @@ def _is_auth_failure(error_msg: str) -> bool: return any(ind in error_lower for ind in _AUTH_INDICATORS) +def _describe_exception(e: BaseException) -> str: + """Never let a blank/whitespace-stringifying exception (e.g. httpx timeouts, + whose str() is '') be persisted as an empty `error` (#1022). Falls back to + the type name. .strip() so whitespace-only messages also trip the fallback.""" + return str(e).strip() or f"{type(e).__name__} (no detail)" + + # #913: Polling-deadline fallback when the schedule's timeout_seconds is # NULL (= "inherit per-agent value"). The scheduler does not have the # per-agent value in this process; the backend enforces the real timeout. @@ -926,7 +933,9 @@ async def _dispatch_and_record_outcome(self, schedule, execution, effective_mess }) except Exception as e: - error_msg = str(e) + # #1022: never persist a blank error — the cron path's only handler + # is generic, so a blank-stringifying dispatch timeout lands here. + error_msg = _describe_exception(e) logger.error(f"Schedule {schedule.name} execution failed: {error_msg}") # SCHED-ASYNC-001: Check current status before overwriting. @@ -989,11 +998,15 @@ async def _run_pre_check(self, agent_name: str) -> Optional[dict]: response = await client.post( f"{config.backend_url}/api/internal/agents/{agent_name}/pre-check", headers=headers, - timeout=70.0, # agent-side timeout is 60s, give us headroom + # #1022: configurable (agent-side hook is 60s; default 70s headroom) + timeout=config.pre_check_timeout, ) except Exception as e: + # #1022: show the timeout type instead of the empty parens a blank + # httpx exception would log. logger.warning( - f"[pre-check] backend call for {agent_name} failed ({e}) — fail-open" + f"[pre-check] backend call for {agent_name} failed " + f"({_describe_exception(e)}) — fail-open" ) return None @@ -1047,7 +1060,8 @@ async def _call_backend_execute_task( Execute a task via the backend's internal TaskExecutionService endpoint. Uses async fire-and-forget dispatch with DB polling (SCHED-ASYNC-001): - 1. POST with async_mode=True and 30s timeout (dispatch only) + 1. POST with async_mode=True and the configured dispatch deadline + (config.dispatch_timeout, default 30s — dispatch only) 2. If backend accepts, poll DB every poll_interval seconds until done 3. Backward compatible: if backend returns sync result, use it directly @@ -1084,16 +1098,30 @@ async def _call_backend_execute_task( if execution_id: payload["execution_id"] = execution_id - # Step 1: Dispatch with short timeout (30s max for the HTTP round-trip) - dispatch_timeout = 30.0 + # Step 1: Dispatch with the configured deadline (#1022 — was a 30s + # literal). The async endpoint normally returns ~instantly; reaching + # this ceiling means the backend did not RESPOND in time, NOT that the + # task was rejected (the bg task may already be running — outcome + # unknown; see the D4 follow-up). + dispatch_timeout = config.dispatch_timeout async with httpx.AsyncClient() as client: - response = await client.post( - f"{config.backend_url}/api/internal/execute-task", - headers=headers, - json=payload, - timeout=dispatch_timeout, - ) + try: + response = await client.post( + f"{config.backend_url}/api/internal/execute-task", + headers=headers, + json=payload, + timeout=dispatch_timeout, + ) + except httpx.TimeoutException as e: + # Outcome UNKNOWN: the backend may have accepted+started the task + # before the timeout (orphan; see D4 follow-up). Name the + # threshold + subtype so the persisted `error` is never blank + # (httpx timeouts str() to '' — the #1022 silent-failure root). + raise Exception( + f"dispatch to /api/internal/execute-task timed out after " + f"{dispatch_timeout}s ({type(e).__name__}) — outcome unknown" + ) from e if response.status_code != 200: error_text = response.text[:500] if response.text else f"HTTP {response.status_code}" @@ -1461,7 +1489,7 @@ async def _execute_retry( self.db.update_execution_status( execution_id=retry_execution.id, status=ExecutionStatus.FAILED, - error=str(e)[:2000] + error=_describe_exception(e)[:2000] # #1022: never blank ) def _recover_pending_retries(self): @@ -1880,7 +1908,10 @@ async def _execute_process_schedule_with_lock(self, schedule_id: str): }) except Exception as e: - error_msg = str(e) + # #1022 defense-in-depth: process-schedule timeouts already have a + # dedicated non-empty handler above; this only sees other blank + # exceptions, normalized here for consistency. + error_msg = _describe_exception(e) logger.error(f"Process schedule {schedule.process_name} execution failed: {error_msg}") self.db.update_process_schedule_execution( diff --git a/tests/scheduler_tests/test_async_dispatch.py b/tests/scheduler_tests/test_async_dispatch.py index 9d19217a..57bc59c9 100644 --- a/tests/scheduler_tests/test_async_dispatch.py +++ b/tests/scheduler_tests/test_async_dispatch.py @@ -18,6 +18,7 @@ sys.path.insert(0, _src_path) import asyncio +import httpx import pytest from unittest.mock import MagicMock, AsyncMock, patch, PropertyMock from datetime import datetime @@ -685,3 +686,222 @@ async def long_running(): assert task.cancelled() or task.done() assert len(service._active_poll_tasks) == 0 + + +class TestDescribeException: + """#1022: _describe_exception never persists a blank/whitespace error.""" + + def test_blank_stringifying_exception_falls_back_to_type_name(self): + # httpx timeout exceptions str() to '' — the exact #1022 trigger. + from scheduler.service import _describe_exception + + msg = _describe_exception(httpx.ReadTimeout("")) + assert msg.strip() # non-empty + assert "ReadTimeout" in msg + + def test_whitespace_only_exception_falls_back_to_type_name(self): + from scheduler.service import _describe_exception + + msg = _describe_exception(Exception(" ")) + assert msg.strip() + assert "Exception" in msg + + def test_normal_exception_message_preserved(self): + from scheduler.service import _describe_exception + + assert _describe_exception(Exception("boom")) == "boom" + + +class TestDispatchTimeout: + """#1022: dispatch POST timeout → descriptive raise + config-driven deadline.""" + + @pytest.mark.asyncio + async def test_dispatch_timeout_raises_descriptive_error( + self, + db_with_data: SchedulerDatabase, + mock_lock_manager: LockManager, + ): + """A blank-stringifying httpx timeout on the POST becomes a descriptive, + non-empty Exception naming the threshold + subtype (was the silent bug).""" + service = SchedulerService( + database=db_with_data, + lock_manager=mock_lock_manager, + ) + + execution = db_with_data.create_execution( + schedule_id="schedule-1", + agent_name="test-agent", + message="Test", + ) + + with patch("httpx.AsyncClient") as mock_client_cls: + mock_client = AsyncMock() + mock_client.post.side_effect = httpx.ReadTimeout("") # str() == '' + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=False) + mock_client_cls.return_value = mock_client + + with pytest.raises(Exception) as exc_info: + await service._call_backend_execute_task( + agent_name="test-agent", + message="Test", + triggered_by="schedule", + execution_id=execution.id, + ) + + msg = str(exc_info.value) + assert "timed out" in msg + assert "30" in msg # the default dispatch_timeout value names the threshold + assert "ReadTimeout" in msg + + @pytest.mark.asyncio + async def test_dispatch_uses_config_timeout( + self, + db_with_data: SchedulerDatabase, + mock_lock_manager: LockManager, + ): + """The dispatch deadline comes from config.dispatch_timeout, not a literal. + Complements test_dispatch_uses_short_timeout (30.0 default).""" + service = SchedulerService( + database=db_with_data, + lock_manager=mock_lock_manager, + ) + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "status": "accepted", + "execution_id": "exec-1", + "async_mode": True, + } + + execution = db_with_data.create_execution( + schedule_id="schedule-1", + agent_name="test-agent", + message="Test", + ) + db_with_data.update_execution_status( + execution_id=execution.id, + status=ExecutionStatus.SUCCESS, + ) + + with patch("httpx.AsyncClient") as mock_client_cls, \ + patch("scheduler.service.config.dispatch_timeout", 12.5): + mock_client = AsyncMock() + mock_client.post.return_value = mock_response + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=False) + mock_client_cls.return_value = mock_client + + await service._call_backend_execute_task( + agent_name="test-agent", + message="Test", + triggered_by="schedule", + execution_id=execution.id, + ) + + call_kwargs = mock_client.post.call_args + assert call_kwargs[1]["timeout"] == 12.5 + + +class TestDispatchTimeoutRegression: + """#1022 regression — the exact failure signature, INVERTED. + + A dispatch httpx timeout (str()=='') driven end-to-end must persist a + NON-EMPTY `error` on a FAILED row, instead of the silent `error=''`. + """ + + @pytest.mark.asyncio + async def test_dispatch_timeout_persists_nonempty_error( + self, + db_with_data: SchedulerDatabase, + mock_lock_manager: LockManager, + ): + service = SchedulerService( + database=db_with_data, + lock_manager=mock_lock_manager, + ) + + mock_lock = MagicMock() + mock_lock_manager.try_acquire_schedule_lock = MagicMock(return_value=mock_lock) + + with patch("httpx.AsyncClient") as mock_client_cls, \ + patch.object(service, "_publish_event", new_callable=AsyncMock): + mock_client = AsyncMock() + # Every POST (pre-check fail-open + dispatch) blank-stringifies. + mock_client.post.side_effect = httpx.ReadTimeout("") + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=False) + mock_client_cls.return_value = mock_client + + await service._execute_schedule_with_lock("schedule-1") + + with db_with_data.get_connection() as conn: + cursor = conn.cursor() + cursor.execute(""" + SELECT status, error FROM schedule_executions + WHERE schedule_id = 'schedule-1' + ORDER BY started_at DESC + LIMIT 1 + """) + row = cursor.fetchone() + + assert row is not None + assert row["status"] == ExecutionStatus.FAILED + # The #1022 signature inverted: error must NOT be blank. + assert row["error"] is not None and row["error"].strip() != "" + assert "timed out" in row["error"] + + +class TestRetryDispatchTimeoutRegression: + """#1022 — the second persistence path the fix touched: `_execute_retry`. + + A blank-stringifying timeout raised by the backend dispatch during a + retry attempt must land a NON-EMPTY `error` on the retry's FAILED row + (service.py:`error=_describe_exception(e)[:2000]`), not the silent + `error=str(e)=''` it replaced. + """ + + @pytest.mark.asyncio + async def test_retry_dispatch_timeout_persists_nonempty_error( + self, + db_with_data: SchedulerDatabase, + mock_lock_manager: LockManager, + ): + service = SchedulerService( + database=db_with_data, + lock_manager=mock_lock_manager, + ) + + # The dispatch inside the retry blank-stringifies (the #1022 trigger). + service._call_backend_execute_task = AsyncMock( + side_effect=httpx.ReadTimeout("") + ) + + await service._execute_retry( + original_execution_id="orig-exec", + failed_execution_id="failed-exec", + schedule_id="schedule-1", + agent_name="test-agent", + message="Retry me", + timeout_seconds=None, + model="claude-sonnet-4-6", + allowed_tools=[], + next_attempt_number=2, + ) + + with db_with_data.get_connection() as conn: + cursor = conn.cursor() + cursor.execute(""" + SELECT status, error FROM schedule_executions + WHERE schedule_id = 'schedule-1' AND triggered_by = 'retry' + ORDER BY started_at DESC + LIMIT 1 + """) + row = cursor.fetchone() + + assert row is not None, "retry execution row was never created" + assert row["status"] == ExecutionStatus.FAILED + # #1022 inverted: a blank-stringifying timeout must NOT persist blank. + assert row["error"] is not None and row["error"].strip() != "" + assert "ReadTimeout" in row["error"] diff --git a/tests/scheduler_tests/test_config.py b/tests/scheduler_tests/test_config.py index 2ca96304..fdc53f67 100644 --- a/tests/scheduler_tests/test_config.py +++ b/tests/scheduler_tests/test_config.py @@ -26,6 +26,9 @@ def test_default_values(self): assert config.lock_auto_renewal is True assert config.health_port == 8001 assert config.default_timezone == "UTC" + # #1022: dispatch + pre-check deadlines lifted from literals to config. + assert config.dispatch_timeout == 30.0 + assert config.pre_check_timeout == 70.0 def test_env_override(self): """Test that environment variables override defaults.""" @@ -53,6 +56,18 @@ def test_agent_timeout(self): config = SchedulerConfig() assert config.agent_timeout == 1800.0 + def test_dispatch_timeout_override(self): + """#1022: DISPATCH_TIMEOUT overrides the dispatch deadline (float).""" + with patch.dict(os.environ, {"REDIS_URL": _TEST_REDIS_URL, "DISPATCH_TIMEOUT": "12.5"}, clear=True): + config = SchedulerConfig() + assert config.dispatch_timeout == 12.5 + + def test_pre_check_timeout_override(self): + """#1022: PRE_CHECK_TIMEOUT overrides the pre-check deadline (float).""" + with patch.dict(os.environ, {"REDIS_URL": _TEST_REDIS_URL, "PRE_CHECK_TIMEOUT": "90"}, clear=True): + config = SchedulerConfig() + assert config.pre_check_timeout == 90.0 + def test_publish_events_default(self): """Test event publishing is enabled by default.""" with patch.dict(os.environ, {"REDIS_URL": _TEST_REDIS_URL}, clear=True): diff --git a/tests/scheduler_tests/test_pre_check.py b/tests/scheduler_tests/test_pre_check.py index 83e8ab2b..ae7c8027 100644 --- a/tests/scheduler_tests/test_pre_check.py +++ b/tests/scheduler_tests/test_pre_check.py @@ -22,6 +22,7 @@ from unittest.mock import AsyncMock, MagicMock, patch +import httpx import pytest from scheduler.models import ExecutionStatus @@ -134,6 +135,40 @@ async def test_malformed_json_returns_none(self, db_with_data): assert decision is None +class TestRunPreCheckTimeout: + """#1022: the pre-check POST deadline is config-driven (was a 70.0 literal), + and a blank-stringifying timeout — the exact #1022 trigger — still fails + open instead of raising. (TestRunPreCheckTranslation already covers the + fail-open path with a *non-blank* exception; this pins the blank case.)""" + + @pytest.mark.asyncio + async def test_post_uses_configured_pre_check_timeout(self, db_with_data): + """The POST timeout comes from config.pre_check_timeout, not a literal.""" + svc = _build_svc(db_with_data) + + response = MagicMock() + response.status_code = 200 + response.json = MagicMock(return_value={"hook_present": False}) + post = AsyncMock(return_value=response) + client_ctx = AsyncMock() + client_ctx.__aenter__.return_value.post = post + + with patch("scheduler.service.httpx.AsyncClient", return_value=client_ctx), \ + patch("scheduler.service.config.pre_check_timeout", 42.0): + await svc._run_pre_check("test-agent") + + assert post.call_args.kwargs["timeout"] == 42.0 + + @pytest.mark.asyncio + async def test_blank_stringifying_timeout_fails_open(self, db_with_data): + """A blank httpx timeout (str()=='') must fail open (return None), + never propagate — fail-open is structural for the pre-check gate.""" + svc = _build_svc(db_with_data) + with _mock_httpx_post(raise_exc=httpx.ReadTimeout("")): + decision = await svc._run_pre_check("test-agent") + assert decision is None + + # --------------------------------------------------------------------------- # SchedulerService._execute_schedule_with_lock pre-check branch # ---------------------------------------------------------------------------