Abilityai · vybe · Jun 16, 2026 · Jun 13, 2026 · Jun 13, 2026 · Jun 13, 2026
diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml
@@ -40,6 +40,10 @@ services:
       - SLACK_CLIENT_ID=${SLACK_CLIENT_ID}
       - SLACK_CLIENT_SECRET=${SLACK_CLIENT_SECRET}
       - SLACK_SIGNING_SECRET=${SLACK_SIGNING_SECRET:-}
+      # Socket Mode concurrent WebSocket count (#244); clamped to [1,10], default 2.
+      # Mirrors docker-compose.yml — without this line the .env knob is inert in
+      # prod (the #1039 packaging-gap class).
+      - SLACK_SOCKET_CONNECTION_COUNT=${SLACK_SOCKET_CONNECTION_COUNT:-2}
       - CREDENTIAL_ENCRYPTION_KEY=${CREDENTIAL_ENCRYPTION_KEY:-}
       - EMAIL_PROVIDER=${EMAIL_PROVIDER:-resend}
       - RESEND_API_KEY=${RESEND_API_KEY:-}
@@ -93,6 +97,14 @@ services:
       - PUBLIC_CHAT_URL=${PUBLIC_CHAT_URL:-}
       # Frontend URL for post-OAuth redirects
       - FRONTEND_URL=${FRONTEND_URL:-}
+      # Backend's own public origin — used to build OAuth redirect URIs
+      # ({BACKEND_URL}/api/oauth/{provider}/callback, read by config.py). Mirrors
+      # docker-compose.yml; without this line the .env value is inert in prod (the
+      # #1039 packaging-gap class) and OAuth callbacks fall back to localhost — set
+      # this to your public backend URL. The scheduler reads BACKEND_URL too but
+      # keeps its own internal default (http://backend:8000): the variable is
+      # deliberately not passed to the scheduler, so it stays on the Docker network.
+      - BACKEND_URL=${BACKEND_URL:-http://localhost:8000}
       # Internal API shared secret (C-003) - for scheduler/agent communication
       - INTERNAL_API_SECRET=${INTERNAL_API_SECRET}
       # Configurable database backend (#300, experimental) — mirrors
@@ -103,6 +115,12 @@ services:
       - DATABASE_URL=${DATABASE_URL:-}
       - DB_POOL_SIZE=${DB_POOL_SIZE:-10}
       - DB_MAX_OVERFLOW=${DB_MAX_OVERFLOW:-20}
+      # Host telemetry (/api/telemetry) — container-stats cache freshness (s) and
+      # max concurrent Docker stat fetches per refresh. Mirrors docker-compose.yml;
+      # without these lines the .env knobs are inert in prod (the #1039
+      # packaging-gap class).
+      - TELEMETRY_CONTAINER_STATS_TTL=${TELEMETRY_CONTAINER_STATS_TTL:-10}
+      - TELEMETRY_DOCKER_POOL_SIZE=${TELEMETRY_DOCKER_POOL_SIZE:-16}
     volumes:
       - /var/run/docker.sock:/var/run/docker.sock:ro
       - ./config/agent-templates:/agent-configs/templates:ro
@@ -279,6 +297,9 @@ services:
       - LOG_LEVEL=${SCHEDULER_LOG_LEVEL:-INFO}
       - LOCK_TIMEOUT=600
       - AGENT_TIMEOUT=900
+      # #1022: scheduler→backend dispatch + pre-check HTTP deadlines, in seconds (operator-tunable).
+      - DISPATCH_TIMEOUT=${DISPATCH_TIMEOUT:-30}
+      - PRE_CHECK_TIMEOUT=${PRE_CHECK_TIMEOUT:-70}
       - MISFIRE_GRACE_TIME=3600
       - PUBLISH_EVENTS=true
       - INTERNAL_API_SECRET=${INTERNAL_API_SECRET}

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -83,12 +83,26 @@ services:
       # Public access / OAuth callbacks
       - PUBLIC_CHAT_URL=${PUBLIC_CHAT_URL:-}
       - FRONTEND_URL=${FRONTEND_URL:-}
+      # Backend's own public origin — used to build OAuth redirect URIs
+      # ({BACKEND_URL}/api/oauth/{provider}/callback, read by config.py). Without
+      # this line the .env value is inert (the #1039 packaging-gap class) and
+      # callbacks fall back to the localhost default. The scheduler reads BACKEND_URL
+      # too but keeps its own internal default (http://backend:8000): the variable is
+      # deliberately not passed to the scheduler, so it stays on the Docker network.
+      - BACKEND_URL=${BACKEND_URL:-http://localhost:8000}
       # CORS (comma-separated extra origins)
       - EXTRA_CORS_ORIGINS=${EXTRA_CORS_ORIGINS:-}
       # Slack channel adapter
       - SLACK_SIGNING_SECRET=${SLACK_SIGNING_SECRET:-}
+      # Socket Mode concurrent WebSocket count (#244); clamped to [1,10], default 2
+      - SLACK_SOCKET_CONNECTION_COUNT=${SLACK_SOCKET_CONNECTION_COUNT:-2}
       # SSH access host override
       - SSH_HOST=${SSH_HOST:-}
+      # Host telemetry (/api/telemetry) — container-stats cache freshness (s) and
+      # max concurrent Docker stat fetches per refresh. Without these lines the
+      # .env knobs are inert (the #1039 packaging-gap class).
+      - TELEMETRY_CONTAINER_STATS_TTL=${TELEMETRY_CONTAINER_STATS_TTL:-10}
+      - TELEMETRY_DOCKER_POOL_SIZE=${TELEMETRY_DOCKER_POOL_SIZE:-16}
       # Canary invariant harness (CANARY-001 / Issue #411).
       # When 1, services/canary_service.py runs the 5-min watcher loop on
       # staging/dev. Default 0 — production users see no canary activity.
@@ -328,6 +342,9 @@ services:
       - LOG_LEVEL=${SCHEDULER_LOG_LEVEL:-INFO}
       - LOCK_TIMEOUT=600
       - AGENT_TIMEOUT=900
+      # #1022: scheduler→backend dispatch + pre-check HTTP deadlines, in seconds (operator-tunable).
+      - DISPATCH_TIMEOUT=${DISPATCH_TIMEOUT:-30}
+      - PRE_CHECK_TIMEOUT=${PRE_CHECK_TIMEOUT:-70}
       - MISFIRE_GRACE_TIME=3600
       - PUBLISH_EVENTS=true
       - INTERNAL_API_SECRET=${INTERNAL_API_SECRET:-}

diff --git a/docs/memory/feature-flows.md b/docs/memory/feature-flows.md
@@ -11,6 +11,7 @@
 
 | Date | ID | Feature | Flow |
 |------|-----|---------|------|
+| 2026-06-14 | #1022 | fix(scheduler): persist a descriptive `error` on dispatch timeout — a dispatch `httpx.TimeoutException` (whose `str()` is `''`) previously landed in the cron path's generic handler and persisted a **blank** `error`. Now re-raised before that handler as a named non-blank message (`"dispatch to /api/internal/execute-task timed out after {N}s — outcome unknown"`); outcome is genuinely UNKNOWN (backend spawns the bg task before replying → may already be running → orphan recovered by cleanup). New `_describe_exception()` helper (type-name fallback) normalizes any blank-stringifying exception across all execution/retry/process-schedule error paths. Dispatch + pre-check HTTP deadlines lifted from literals to config: `DISPATCH_TIMEOUT` (default 30s) and `PRE_CHECK_TIMEOUT` (default 70s). Scheduler-only (`src/scheduler/`); +270 lines of tests (incl. pre-check config-deadline + retry-path blank-error regressions). | [scheduler-service.md](feature-flows/scheduler-service.md), [scheduler-pre-check.md](feature-flows/scheduler-pre-check.md) |
 | 2026-06-11 | #858 | fix: first-time setup token silently lost — `docker/backend/Dockerfile` had drifted and lost `ENV PYTHONUNBUFFERED=1` (which `docker/scheduler/Dockerfile` still set), so CPython block-buffered the lifespan's stdout to the Docker log pipe (~8KB) and the printed setup token never reached `docker logs`, deadlocking fresh installs (the only documented path through the `routers/setup.py` token gate). Two-layer fix: (1) restore `PYTHONUNBUFFERED=1` (catches every `print()`); (2) the setup-token block + ~76 other lifespan `print()` calls now emit via the structured `logger` — the token as a single multi-line `logger.warning` **relocated to immediately after `setup_logging()`**, before the event-bus/audit-write startup that could otherwise hang and suppress it (the `StreamHandler` flushes per record, so it's immune to future Dockerfile drift and flows through Vector). `setup_opentelemetry()`'s import-time print + the `register_enterprise` prints stay `print(..., flush=True)` (they run before `setup_logging()`). New `unit/test_858_dockerfile_unbuffered.py` backend↔scheduler parity guard (2 tests). Note: stdout→stderr stream move for the converted lines (Docker/Vector capture both). Known follow-up #1165: prod runs uvicorn `--workers 2`, so the per-process token is still ~50% flaky until unified. | [first-time-setup.md](feature-flows/first-time-setup.md) |
 | 2026-06-10 | #1130 | fix: retired `gemini-2.0-flash` replaced with env-configurable models — `GEMINI_TEXT_MODEL` (image-gen prompt refinement) + `GEMINI_TRANSCRIPTION_MODEL` (Telegram voice), both default `gemini-3.5-flash`, defined in `config.py`, empty-string-safe wiring in both compose files (#1076 pattern). | [image-generation.md](feature-flows/image-generation.md), [telegram-integration.md](feature-flows/telegram-integration.md) |
 | 2026-06-10 | #1108 | feat(ui): Agent Detail **Guardrails** tab renamed to **Settings** — sectioned config home. New `components/settings/SettingsPanel.vue` renders `GuardrailsPanel` unchanged as section #1; future per-agent settings land as additive sections, not new tabs. `?tab=guardrails` deep links alias to `settings` via `TAB_ALIASES`. Pure frontend. | [agent-guardrails.md](feature-flows/agent-guardrails.md) |

diff --git a/docs/memory/feature-flows/scheduler-pre-check.md b/docs/memory/feature-flows/scheduler-pre-check.md
@@ -33,7 +33,7 @@ Returns:
 ## Scheduler Layer
 **Service**: `src/scheduler/service.py` — `_run_pre_check(agent_name)`
 
-Calls the backend's internal endpoint (not the agent directly — topology stays "scheduler → backend → agent"). Translates the backend response into a scheduler decision:
+Calls the backend's internal endpoint (not the agent directly — topology stays "scheduler → backend → agent"). The scheduler→backend HTTP call uses `config.pre_check_timeout` (env `PRE_CHECK_TIMEOUT`, default 70s — the agent-side hook deadline is 60s, this adds headroom; lifted from a literal in #1022). A timeout fails open (`None` → fire as usual) and is logged with the exception **type** rather than the empty parens a bare `httpx` timeout would otherwise print (#1022). Translates the backend response into a scheduler decision:
 
 | Backend response | Scheduler decision |
 |---|---|

diff --git a/docs/memory/feature-flows/scheduler-service.md b/docs/memory/feature-flows/scheduler-service.md
@@ -107,6 +107,8 @@ As a **platform administrator**, I want **scheduled tasks to execute exactly onc
 | `SCHEDULE_RELOAD_INTERVAL` | `60` | Seconds between schedule sync checks |
 | `AGENT_TIMEOUT` | `900` | Default agent request timeout (15 min) |
 | `POLL_INTERVAL` | `10` | Seconds between DB polls for async task completion (SCHED-ASYNC-001) |
+| `DISPATCH_TIMEOUT` | `30` | HTTP deadline for the scheduler→backend `POST /api/internal/execute-task` round-trip (dispatch only; the async endpoint returns ~instantly). Reaching it means the backend did not respond — outcome is **UNKNOWN**, not "rejected" (#1022) |
+| `PRE_CHECK_TIMEOUT` | `70` | HTTP deadline for the scheduler→backend pre-check call (agent-side hook is 60s; 10s headroom). Fail-open (#1022) |
 | `MISFIRE_GRACE_TIME` | `3600` | Seconds after a missed trigger that APScheduler will still execute (Issue #145) |
 | `BACKEND_URL` | `http://backend:8000` | Backend API URL for process executions and task delegation |
 | `INTERNAL_API_SECRET` | _(empty)_ | Shared secret for backend internal API auth (C-003) |
@@ -293,13 +295,15 @@ async def _execute_schedule_with_lock(self, schedule_id: str, triggered_by: str
             self.db.update_execution_status(execution_id=execution.id, status=ExecutionStatus.FAILED, ...)
 ```
 
-**Backend Task Delegation** (`service.py:760-833`):
+**Backend Task Delegation** (`_call_backend_execute_task`):
 
 Uses async fire-and-forget dispatch (SCHED-ASYNC-001):
-1. `POST /api/internal/execute-task` with `async_mode=True` and 30s HTTP timeout
+1. `POST /api/internal/execute-task` with `async_mode=True` and the configured dispatch deadline (`config.dispatch_timeout`, env `DISPATCH_TIMEOUT`, default 30s — was a literal before #1022)
 2. If backend accepts (`{"status": "accepted", "async_mode": true}`), poll DB
 3. Backward compatible: if backend returns sync result, use directly
 
+A dispatch `httpx.TimeoutException` is re-raised as a **named, non-blank** error (`"dispatch to /api/internal/execute-task timed out after {N}s ({TimeoutType}) — outcome unknown"`) *before* it can reach the generic `except` handler. The outcome is genuinely unknown: the backend spawns the background task before replying (`internal.py`), so a dispatch timeout may mean the task is already running and will surface as an orphan recovered by the cleanup service. This closes the #1022 silent-failure — bare `httpx` timeouts stringify to `''`, which previously persisted as a blank `error`. Defense-in-depth: `_describe_exception()` normalizes any other blank-stringifying exception (falls back to the type name) across every execution, retry, and process-schedule error path.
+
 **DB Polling** (`service.py:835-887`):
 ```python
 async def _poll_execution_completion(self, execution_id, timeout_seconds):
@@ -954,7 +958,7 @@ typing-extensions>=4.9.0
 | `test_locking.py` | Distributed locks | Redis lock acquire/release |
 | `test_agent_client.py` | HTTP client | Agent communication |
 | `test_service.py` | Scheduler service | Full integration tests |
-| `test_async_dispatch.py` | Async dispatch + polling | SCHED-ASYNC-001 (11 tests) |
+| `test_async_dispatch.py` | Async dispatch + polling | SCHED-ASYNC-001 + dispatch-timeout error persistence (#1022): `_describe_exception()` branches, descriptive-raise on `ReadTimeout('')`, config-driven deadline, non-empty-error regression for both the main dispatch and the retry path (22 tests) |
 | `conftest.py` | Fixtures | Mock database, Redis, models |
 
 ### Running Tests
@@ -985,6 +989,8 @@ docker compose -f docker/scheduler/docker-compose.test.yml up
 | Agent timeout | Update status=failed (with overwrite guard), publish event | Error recorded |
 | Auth failure detected | Log auth-specific error, publish event | Error recorded with auth context |
 | **TCP disconnect (SCHED-ASYNC-001)** | **Check DB before overwriting -- if backend already finalized, preserve status** | **No false failures** |
+| **Dispatch timeout (#1022)** | Re-raised as a named non-blank error *before* the generic handler; outcome UNKNOWN (task may already be running → orphan) | Descriptive `error` persisted (never blank); cleanup service recovers any orphan |
+| **Blank-stringifying exception (e.g. httpx timeout)** | `_describe_exception()` falls back to the exception type name | `error` never persisted blank (#1022) |
 | **Polling deadline exceeded** | Raise exception, overwrite guard checks DB status | Error recorded (if genuinely stale) |
 | Process backend HTTP error | Update process execution as failed, publish event | Error recorded |
 | Process backend timeout | Update process execution as failed, publish event | Error recorded |
@@ -1078,6 +1084,7 @@ The embedded scheduler (`src/backend/services/scheduler_service.py`) has been co
 
 | Date | Change |
 |------|--------|
+| 2026-06-14 | **Descriptive error on dispatch timeout (#1022)**: a dispatch `httpx.TimeoutException` is re-raised with a named, non-blank message (`"dispatch ... timed out after {N}s — outcome unknown"`) so the silent blank `error` (bare httpx timeouts stringify to `''`) is no longer persisted. New `_describe_exception()` helper normalizes blank-stringifying exceptions across all execution / retry / process-schedule error paths. Dispatch + pre-check HTTP deadlines lifted from literals to config: `DISPATCH_TIMEOUT` (default 30s) and `PRE_CHECK_TIMEOUT` (default 70s). Outcome of a dispatch timeout is UNKNOWN (backend may have already started the bg task → orphan recovered by cleanup). |
 | 2026-05-09 | **MCP update_agent_schedule fixes (aaad4f6, #741/#742)**: (1) `src/mcp-server/src/tools/schedules.ts` — added explicit warning to `enabled` field description in `update_agent_schedule` so AI models do not inadvertently re-enable a disabled schedule when updating unrelated fields (e.g. cron expression). (2) `src/backend/routers/schedules.py` — added `max_retries` and `retry_delay_seconds` to `ScheduleUpdateRequest`; both were missing, so `exclude_unset=True` silently dropped them before they reached `db.update_schedule()`, making retry config uneditable via MCP. |
 | 2026-04-23 | **Retry Default Flipped (#476)**: `max_retries` default `1 → 0`. Both new and existing schedules are opt-in now. Scheduled agents typically catch up on next tick; retries amplified load during multi-hour outages. |
 | 2026-04-14 | **Automatic Retry (RETRY-001)**: Added Flow 10 documenting configurable retry mechanism for failed executions. New fields: max_retries, retry_delay_seconds, attempt_number, retry_of_execution_id, retry_scheduled_at. New status: pending_retry. |