From 51c18031de920f4dad1078917a0947020e147a89 Mon Sep 17 00:00:00 2001 From: Andrii Pasternak Date: Sat, 13 Jun 2026 15:58:11 +0100 Subject: [PATCH 1/9] fix(scheduler): persist descriptive error on dispatch timeout (#1022) Cron-triggered executions could land as schedule_executions.status='failed' with a blank `error` and no `response`, giving operators zero diagnostics. Two scheduler defects combined: the scheduler->backend execute-task POST had a hardcoded 30s deadline with no try/except, so an httpx.TimeoutException propagated raw; and the generic failure handler recorded str(e), which is the empty string for httpx timeouts -- persisted verbatim. - config.py: add tunable dispatch_timeout (DISPATCH_TIMEOUT, default 30) and pre_check_timeout (PRE_CHECK_TIMEOUT, default 70) float fields, following the agent_timeout default_factory idiom. - service.py: _describe_exception() normalizer (never persist a blank/whitespace error -- falls back to the type name); narrow try/except httpx.TimeoutException around the dispatch POST that re-raises an outcome-neutral, descriptive message ("... timed out after Ns (ReadTimeout) -- outcome unknown") with `from e`; source the deadline from config; apply the normalizer at the cron, retry, and process-schedule generic handlers; _run_pre_check uses config.pre_check_timeout + a typed fail-open log. Stale 30s/70s comments updated to reference config. - docker-compose{,.prod}.yml: declare DISPATCH_TIMEOUT/PRE_CHECK_TIMEOUT as host-overridable env vars on the scheduler service. The backend event-loop stall that triggers the timeout, and the dispatch-timeout control-flow gap (orphan task / no retry / read-then-write race), are out of scope and tracked as a follow-up. 
Tests (tests/scheduler_tests/): config asserts (both fields default + override), _describe_exception both branches, descriptive-raise on ReadTimeout(''), config-driven dispatch timeout, and a regression proving the persisted error is non-empty end-to-end (the #1022 signature, inverted). 189 passed. Closes #1022 Co-Authored-By: Claude Opus 4.8 (1M context) --- docker-compose.prod.yml | 3 + docker-compose.yml | 3 + src/scheduler/config.py | 20 +++ src/scheduler/service.py | 59 +++++-- tests/scheduler_tests/test_async_dispatch.py | 166 +++++++++++++++++++ tests/scheduler_tests/test_config.py | 15 ++ 6 files changed, 252 insertions(+), 14 deletions(-) diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index d4a2e3c45..acb83ac17 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -279,6 +279,9 @@ services: - LOG_LEVEL=${SCHEDULER_LOG_LEVEL:-INFO} - LOCK_TIMEOUT=600 - AGENT_TIMEOUT=900 + # #1022: scheduler→backend dispatch + pre-check deadlines (operator-tunable). + - DISPATCH_TIMEOUT=${DISPATCH_TIMEOUT:-30} + - PRE_CHECK_TIMEOUT=${PRE_CHECK_TIMEOUT:-70} - MISFIRE_GRACE_TIME=3600 - PUBLISH_EVENTS=true - INTERNAL_API_SECRET=${INTERNAL_API_SECRET} diff --git a/docker-compose.yml b/docker-compose.yml index 5258af7b5..1f4a66249 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -328,6 +328,9 @@ services: - LOG_LEVEL=${SCHEDULER_LOG_LEVEL:-INFO} - LOCK_TIMEOUT=600 - AGENT_TIMEOUT=900 + # #1022: scheduler→backend dispatch + pre-check deadlines (operator-tunable). 
+ - DISPATCH_TIMEOUT=${DISPATCH_TIMEOUT:-30} + - PRE_CHECK_TIMEOUT=${PRE_CHECK_TIMEOUT:-70} - MISFIRE_GRACE_TIME=3600 - PUBLISH_EVENTS=true - INTERNAL_API_SECRET=${INTERNAL_API_SECRET:-} diff --git a/src/scheduler/config.py b/src/scheduler/config.py index 194dd2ce1..5a1f83f7b 100644 --- a/src/scheduler/config.py +++ b/src/scheduler/config.py @@ -75,6 +75,26 @@ class SchedulerConfig: "POLL_DEADLINE_BUFFER", "60" ))) + # Dispatch deadline for the scheduler→backend POST /api/internal/execute-task + # round-trip. The async endpoint normally returns ~instantly; reaching this + # ceiling means the backend did not RESPOND in time (event-loop stall under + # fan-out). NOTE: outcome is UNKNOWN, not "rejected" — the backend spawns the + # bg task before replying (internal.py), so a timeout may mean the task is + # already running (it then becomes an orphan recovered by the cleanup service; + # see the D4 follow-up). Lifted from a literal (#1022) so the deadline is + # tunable + "dispatch timed out" is distinguishable from "task ran long". + dispatch_timeout: float = field(default_factory=lambda: float(os.getenv( + "DISPATCH_TIMEOUT", "30" + ))) + + # Pre-check hook deadline (agent-side hook is 60s; this is the backend-call ceiling + # with headroom). Same backend stall that times out dispatch also times out the + # pre-check; lifted from the 70.0 literal (#1022, D1) for tunability. Fail-open — + # a pre-check timeout never produces a failed row, it just fires the schedule. + pre_check_timeout: float = field(default_factory=lambda: float(os.getenv( + "PRE_CHECK_TIMEOUT", "70" + ))) + # Misfire grace time — how long after a missed trigger APScheduler will # still execute the job. Default 30s is far too low for weekly cron jobs # whose container may restart. 3600s (1 hour) gives ample runway. 
diff --git a/src/scheduler/service.py b/src/scheduler/service.py index b849159dd..3063fe4f1 100644 --- a/src/scheduler/service.py +++ b/src/scheduler/service.py @@ -81,6 +81,13 @@ def _is_auth_failure(error_msg: str) -> bool: return any(ind in error_lower for ind in _AUTH_INDICATORS) +def _describe_exception(e: BaseException) -> str: + """Never let a blank/whitespace-stringifying exception (e.g. httpx timeouts, + whose str() is '') be persisted as an empty `error` (#1022). Falls back to + the type name. .strip() so whitespace-only messages also trip the fallback.""" + return str(e).strip() or f"{type(e).__name__} (no detail)" + + # #913: Polling-deadline fallback when the schedule's timeout_seconds is # NULL (= "inherit per-agent value"). The scheduler does not have the # per-agent value in this process; the backend enforces the real timeout. @@ -926,7 +933,9 @@ async def _dispatch_and_record_outcome(self, schedule, execution, effective_mess }) except Exception as e: - error_msg = str(e) + # #1022: never persist a blank error — the cron path's only handler + # is generic, so a blank-stringifying dispatch timeout lands here. + error_msg = _describe_exception(e) logger.error(f"Schedule {schedule.name} execution failed: {error_msg}") # SCHED-ASYNC-001: Check current status before overwriting. @@ -989,11 +998,15 @@ async def _run_pre_check(self, agent_name: str) -> Optional[dict]: response = await client.post( f"{config.backend_url}/api/internal/agents/{agent_name}/pre-check", headers=headers, - timeout=70.0, # agent-side timeout is 60s, give us headroom + # #1022: configurable (agent-side hook is 60s; default 70s headroom) + timeout=config.pre_check_timeout, ) except Exception as e: + # #1022: show the timeout type instead of the empty parens a blank + # httpx exception would log. 
logger.warning( - f"[pre-check] backend call for {agent_name} failed ({e}) — fail-open" + f"[pre-check] backend call for {agent_name} failed " + f"({_describe_exception(e)}) — fail-open" ) return None @@ -1047,7 +1060,8 @@ async def _call_backend_execute_task( Execute a task via the backend's internal TaskExecutionService endpoint. Uses async fire-and-forget dispatch with DB polling (SCHED-ASYNC-001): - 1. POST with async_mode=True and 30s timeout (dispatch only) + 1. POST with async_mode=True and the configured dispatch deadline + (config.dispatch_timeout, default 30s — dispatch only) 2. If backend accepts, poll DB every poll_interval seconds until done 3. Backward compatible: if backend returns sync result, use it directly @@ -1084,16 +1098,30 @@ async def _call_backend_execute_task( if execution_id: payload["execution_id"] = execution_id - # Step 1: Dispatch with short timeout (30s max for the HTTP round-trip) - dispatch_timeout = 30.0 + # Step 1: Dispatch with the configured deadline (#1022 — was a 30s + # literal). The async endpoint normally returns ~instantly; reaching + # this ceiling means the backend did not RESPOND in time, NOT that the + # task was rejected (the bg task may already be running — outcome + # unknown; see the D4 follow-up). + dispatch_timeout = config.dispatch_timeout async with httpx.AsyncClient() as client: - response = await client.post( - f"{config.backend_url}/api/internal/execute-task", - headers=headers, - json=payload, - timeout=dispatch_timeout, - ) + try: + response = await client.post( + f"{config.backend_url}/api/internal/execute-task", + headers=headers, + json=payload, + timeout=dispatch_timeout, + ) + except httpx.TimeoutException as e: + # Outcome UNKNOWN: the backend may have accepted+started the task + # before the timeout (orphan; see D4 follow-up). Name the + # threshold + subtype so the persisted `error` is never blank + # (httpx timeouts str() to '' — the #1022 silent-failure root). 
+ raise Exception( + f"dispatch to /api/internal/execute-task timed out after " + f"{dispatch_timeout}s ({type(e).__name__}) — outcome unknown" + ) from e if response.status_code != 200: error_text = response.text[:500] if response.text else f"HTTP {response.status_code}" @@ -1461,7 +1489,7 @@ async def _execute_retry( self.db.update_execution_status( execution_id=retry_execution.id, status=ExecutionStatus.FAILED, - error=str(e)[:2000] + error=_describe_exception(e)[:2000] # #1022: never blank ) def _recover_pending_retries(self): @@ -1880,7 +1908,10 @@ async def _execute_process_schedule_with_lock(self, schedule_id: str): }) except Exception as e: - error_msg = str(e) + # #1022 defense-in-depth: process-schedule timeouts already have a + # dedicated non-empty handler above; this only sees other blank + # exceptions, normalized here for consistency. + error_msg = _describe_exception(e) logger.error(f"Process schedule {schedule.process_name} execution failed: {error_msg}") self.db.update_process_schedule_execution( diff --git a/tests/scheduler_tests/test_async_dispatch.py b/tests/scheduler_tests/test_async_dispatch.py index 9d19217a7..ce2d25d8d 100644 --- a/tests/scheduler_tests/test_async_dispatch.py +++ b/tests/scheduler_tests/test_async_dispatch.py @@ -18,6 +18,7 @@ sys.path.insert(0, _src_path) import asyncio +import httpx import pytest from unittest.mock import MagicMock, AsyncMock, patch, PropertyMock from datetime import datetime @@ -685,3 +686,168 @@ async def long_running(): assert task.cancelled() or task.done() assert len(service._active_poll_tasks) == 0 + + +class TestDescribeException: + """#1022: _describe_exception never persists a blank/whitespace error.""" + + def test_blank_stringifying_exception_falls_back_to_type_name(self): + # httpx timeout exceptions str() to '' — the exact #1022 trigger. 
+ from scheduler.service import _describe_exception + + msg = _describe_exception(httpx.ReadTimeout("")) + assert msg.strip() # non-empty + assert "ReadTimeout" in msg + + def test_whitespace_only_exception_falls_back_to_type_name(self): + from scheduler.service import _describe_exception + + msg = _describe_exception(Exception(" ")) + assert msg.strip() + assert "Exception" in msg + + def test_normal_exception_message_preserved(self): + from scheduler.service import _describe_exception + + assert _describe_exception(Exception("boom")) == "boom" + + +class TestDispatchTimeout: + """#1022: dispatch POST timeout → descriptive raise + config-driven deadline.""" + + @pytest.mark.asyncio + async def test_dispatch_timeout_raises_descriptive_error( + self, + db_with_data: SchedulerDatabase, + mock_lock_manager: LockManager, + ): + """A blank-stringifying httpx timeout on the POST becomes a descriptive, + non-empty Exception naming the threshold + subtype (was the silent bug).""" + service = SchedulerService( + database=db_with_data, + lock_manager=mock_lock_manager, + ) + + execution = db_with_data.create_execution( + schedule_id="schedule-1", + agent_name="test-agent", + message="Test", + ) + + with patch("httpx.AsyncClient") as mock_client_cls: + mock_client = AsyncMock() + mock_client.post.side_effect = httpx.ReadTimeout("") # str() == '' + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=False) + mock_client_cls.return_value = mock_client + + with pytest.raises(Exception) as exc_info: + await service._call_backend_execute_task( + agent_name="test-agent", + message="Test", + triggered_by="schedule", + execution_id=execution.id, + ) + + msg = str(exc_info.value) + assert "timed out" in msg + assert "30" in msg # the default dispatch_timeout value names the threshold + assert "ReadTimeout" in msg + + @pytest.mark.asyncio + async def test_dispatch_uses_config_timeout( + self, + db_with_data: SchedulerDatabase, + 
mock_lock_manager: LockManager, + ): + """The dispatch deadline comes from config.dispatch_timeout, not a literal. + Complements test_dispatch_uses_short_timeout (30.0 default).""" + service = SchedulerService( + database=db_with_data, + lock_manager=mock_lock_manager, + ) + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "status": "accepted", + "execution_id": "exec-1", + "async_mode": True, + } + + execution = db_with_data.create_execution( + schedule_id="schedule-1", + agent_name="test-agent", + message="Test", + ) + db_with_data.update_execution_status( + execution_id=execution.id, + status=ExecutionStatus.SUCCESS, + ) + + with patch("httpx.AsyncClient") as mock_client_cls, \ + patch("scheduler.service.config.dispatch_timeout", 12.5): + mock_client = AsyncMock() + mock_client.post.return_value = mock_response + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=False) + mock_client_cls.return_value = mock_client + + await service._call_backend_execute_task( + agent_name="test-agent", + message="Test", + triggered_by="schedule", + execution_id=execution.id, + ) + + call_kwargs = mock_client.post.call_args + assert call_kwargs[1]["timeout"] == 12.5 + + +class TestDispatchTimeoutRegression: + """#1022 regression — the exact failure signature, INVERTED. + + A dispatch httpx timeout (str()=='') driven end-to-end must persist a + NON-EMPTY `error` on a FAILED row, instead of the silent `error=''`. 
+ """ + + @pytest.mark.asyncio + async def test_dispatch_timeout_persists_nonempty_error( + self, + db_with_data: SchedulerDatabase, + mock_lock_manager: LockManager, + ): + service = SchedulerService( + database=db_with_data, + lock_manager=mock_lock_manager, + ) + + mock_lock = MagicMock() + mock_lock_manager.try_acquire_schedule_lock = MagicMock(return_value=mock_lock) + + with patch("httpx.AsyncClient") as mock_client_cls, \ + patch.object(service, "_publish_event", new_callable=AsyncMock): + mock_client = AsyncMock() + # Every POST (pre-check fail-open + dispatch) blank-stringifies. + mock_client.post.side_effect = httpx.ReadTimeout("") + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=False) + mock_client_cls.return_value = mock_client + + await service._execute_schedule_with_lock("schedule-1") + + with db_with_data.get_connection() as conn: + cursor = conn.cursor() + cursor.execute(""" + SELECT status, error FROM schedule_executions + WHERE schedule_id = 'schedule-1' + ORDER BY started_at DESC + LIMIT 1 + """) + row = cursor.fetchone() + + assert row is not None + assert row["status"] == ExecutionStatus.FAILED + # The #1022 signature inverted: error must NOT be blank. + assert row["error"] is not None and row["error"].strip() != "" + assert "timed out" in row["error"] diff --git a/tests/scheduler_tests/test_config.py b/tests/scheduler_tests/test_config.py index 2ca96304b..fdc53f670 100644 --- a/tests/scheduler_tests/test_config.py +++ b/tests/scheduler_tests/test_config.py @@ -26,6 +26,9 @@ def test_default_values(self): assert config.lock_auto_renewal is True assert config.health_port == 8001 assert config.default_timezone == "UTC" + # #1022: dispatch + pre-check deadlines lifted from literals to config. 
+ assert config.dispatch_timeout == 30.0 + assert config.pre_check_timeout == 70.0 def test_env_override(self): """Test that environment variables override defaults.""" @@ -53,6 +56,18 @@ def test_agent_timeout(self): config = SchedulerConfig() assert config.agent_timeout == 1800.0 + def test_dispatch_timeout_override(self): + """#1022: DISPATCH_TIMEOUT overrides the dispatch deadline (float).""" + with patch.dict(os.environ, {"REDIS_URL": _TEST_REDIS_URL, "DISPATCH_TIMEOUT": "12.5"}, clear=True): + config = SchedulerConfig() + assert config.dispatch_timeout == 12.5 + + def test_pre_check_timeout_override(self): + """#1022: PRE_CHECK_TIMEOUT overrides the pre-check deadline (float).""" + with patch.dict(os.environ, {"REDIS_URL": _TEST_REDIS_URL, "PRE_CHECK_TIMEOUT": "90"}, clear=True): + config = SchedulerConfig() + assert config.pre_check_timeout == 90.0 + def test_publish_events_default(self): """Test event publishing is enabled by default.""" with patch.dict(os.environ, {"REDIS_URL": _TEST_REDIS_URL}, clear=True): From a97110f11046ec81f4965cd54e2e116e0e70a1d1 Mon Sep 17 00:00:00 2001 From: Andrii Pasternak Date: Sun, 14 Jun 2026 00:50:31 +0100 Subject: [PATCH 2/9] test(scheduler): regression for retry-path dispatch-timeout error (#1022) `_execute_retry` is the second persistence site the #1022 fix touched. Add TestRetryDispatchTimeoutRegression proving a blank-stringifying httpx.ReadTimeout('') raised during a retry dispatch lands a non-empty `error` on the retry's FAILED row (the #1022 signature, inverted). 
Refs #1022 Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/scheduler_tests/test_async_dispatch.py | 54 ++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/tests/scheduler_tests/test_async_dispatch.py b/tests/scheduler_tests/test_async_dispatch.py index ce2d25d8d..57bc59c9b 100644 --- a/tests/scheduler_tests/test_async_dispatch.py +++ b/tests/scheduler_tests/test_async_dispatch.py @@ -851,3 +851,57 @@ async def test_dispatch_timeout_persists_nonempty_error( # The #1022 signature inverted: error must NOT be blank. assert row["error"] is not None and row["error"].strip() != "" assert "timed out" in row["error"] + + +class TestRetryDispatchTimeoutRegression: + """#1022 — the second persistence path the fix touched: `_execute_retry`. + + A blank-stringifying timeout raised by the backend dispatch during a + retry attempt must land a NON-EMPTY `error` on the retry's FAILED row + (service.py:`error=_describe_exception(e)[:2000]`), not the silent + `error=str(e)=''` it replaced. + """ + + @pytest.mark.asyncio + async def test_retry_dispatch_timeout_persists_nonempty_error( + self, + db_with_data: SchedulerDatabase, + mock_lock_manager: LockManager, + ): + service = SchedulerService( + database=db_with_data, + lock_manager=mock_lock_manager, + ) + + # The dispatch inside the retry blank-stringifies (the #1022 trigger). 
+ service._call_backend_execute_task = AsyncMock( + side_effect=httpx.ReadTimeout("") + ) + + await service._execute_retry( + original_execution_id="orig-exec", + failed_execution_id="failed-exec", + schedule_id="schedule-1", + agent_name="test-agent", + message="Retry me", + timeout_seconds=None, + model="claude-sonnet-4-6", + allowed_tools=[], + next_attempt_number=2, + ) + + with db_with_data.get_connection() as conn: + cursor = conn.cursor() + cursor.execute(""" + SELECT status, error FROM schedule_executions + WHERE schedule_id = 'schedule-1' AND triggered_by = 'retry' + ORDER BY started_at DESC + LIMIT 1 + """) + row = cursor.fetchone() + + assert row is not None, "retry execution row was never created" + assert row["status"] == ExecutionStatus.FAILED + # #1022 inverted: a blank-stringifying timeout must NOT persist blank. + assert row["error"] is not None and row["error"].strip() != "" + assert "ReadTimeout" in row["error"] From 21553bb0059ef0d0980212b93bd24145d95b5f5a Mon Sep 17 00:00:00 2001 From: Andrii Pasternak Date: Sun, 14 Jun 2026 00:50:36 +0100 Subject: [PATCH 3/9] test(scheduler): pre-check timeout is config-driven + fails open on blank (#1022) Add TestRunPreCheckTimeout: the pre-check POST deadline now comes from config.pre_check_timeout (was a 70.0 literal), and a blank-stringifying httpx.ReadTimeout('') still fails open (returns None) rather than propagating. 
Refs #1022 Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/scheduler_tests/test_pre_check.py | 35 +++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/tests/scheduler_tests/test_pre_check.py b/tests/scheduler_tests/test_pre_check.py index 83e8ab2b6..ae7c8027a 100644 --- a/tests/scheduler_tests/test_pre_check.py +++ b/tests/scheduler_tests/test_pre_check.py @@ -22,6 +22,7 @@ from unittest.mock import AsyncMock, MagicMock, patch +import httpx import pytest from scheduler.models import ExecutionStatus @@ -134,6 +135,40 @@ async def test_malformed_json_returns_none(self, db_with_data): assert decision is None +class TestRunPreCheckTimeout: + """#1022: the pre-check POST deadline is config-driven (was a 70.0 literal), + and a blank-stringifying timeout — the exact #1022 trigger — still fails + open instead of raising. (TestRunPreCheckTranslation already covers the + fail-open path with a *non-blank* exception; this pins the blank case.)""" + + @pytest.mark.asyncio + async def test_post_uses_configured_pre_check_timeout(self, db_with_data): + """The POST timeout comes from config.pre_check_timeout, not a literal.""" + svc = _build_svc(db_with_data) + + response = MagicMock() + response.status_code = 200 + response.json = MagicMock(return_value={"hook_present": False}) + post = AsyncMock(return_value=response) + client_ctx = AsyncMock() + client_ctx.__aenter__.return_value.post = post + + with patch("scheduler.service.httpx.AsyncClient", return_value=client_ctx), \ + patch("scheduler.service.config.pre_check_timeout", 42.0): + await svc._run_pre_check("test-agent") + + assert post.call_args.kwargs["timeout"] == 42.0 + + @pytest.mark.asyncio + async def test_blank_stringifying_timeout_fails_open(self, db_with_data): + """A blank httpx timeout (str()=='') must fail open (return None), + never propagate — fail-open is structural for the pre-check gate.""" + svc = _build_svc(db_with_data) + with _mock_httpx_post(raise_exc=httpx.ReadTimeout("")): + 
+ decision = await svc._run_pre_check("test-agent") + assert decision is None + + # --------------------------------------------------------------------------- # SchedulerService._execute_schedule_with_lock pre-check branch # --------------------------------------------------------------------------- From 394429307bf69a66dcd120edc604ef540d5c1830 Mon Sep 17 00:00:00 2001 From: Andrii Pasternak Date: Sun, 14 Jun 2026 00:50:40 +0100 Subject: [PATCH 4/9] docs(scheduler): document dispatch-timeout error persistence (#1022) Update scheduler feature flows for the #1022 fix: new DISPATCH_TIMEOUT / PRE_CHECK_TIMEOUT config rows, the named non-blank dispatch-timeout raise plus the _describe_exception() normalizer, the error-handling matrix entries, the test catalog, the change history, and a Recent Updates index row. Refs #1022 Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/memory/feature-flows.md | 1 + docs/memory/feature-flows/scheduler-pre-check.md | 2 +- docs/memory/feature-flows/scheduler-service.md | 13 ++++++++++--- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/docs/memory/feature-flows.md b/docs/memory/feature-flows.md index d78d6df41..47317d3ec 100644 --- a/docs/memory/feature-flows.md +++ b/docs/memory/feature-flows.md @@ -11,6 +11,7 @@ | Date | ID | Feature | Flow | |------|-----|---------|------| +| 2026-06-14 | #1022 | fix(scheduler): persist a descriptive `error` on dispatch timeout — a dispatch `httpx.TimeoutException` (whose `str()` is `''`) previously landed in the cron path's generic handler and persisted a **blank** `error`. Now re-raised before that handler as a named non-blank message (`"dispatch to /api/internal/execute-task timed out after {N}s ({type}) — outcome unknown"`); outcome is genuinely UNKNOWN (backend spawns the bg task before replying → may already be running → orphan recovered by cleanup). 
New `_describe_exception()` helper (type-name fallback) normalizes any blank-stringifying exception across all execution/retry/process-schedule error paths. Dispatch + pre-check HTTP deadlines lifted from literals to config: `DISPATCH_TIMEOUT` (default 30s) and `PRE_CHECK_TIMEOUT` (default 70s). Scheduler-only (`src/scheduler/`); +270 lines of tests (incl. pre-check config-deadline + retry-path blank-error regressions). | [scheduler-service.md](feature-flows/scheduler-service.md), [scheduler-pre-check.md](feature-flows/scheduler-pre-check.md) | | 2026-06-10 | #1130 | fix: retired `gemini-2.0-flash` replaced with env-configurable models — `GEMINI_TEXT_MODEL` (image-gen prompt refinement) + `GEMINI_TRANSCRIPTION_MODEL` (Telegram voice), both default `gemini-3.5-flash`, defined in `config.py`, empty-string-safe wiring in both compose files (#1076 pattern). | [image-generation.md](feature-flows/image-generation.md), [telegram-integration.md](feature-flows/telegram-integration.md) | | 2026-06-10 | #1108 | feat(ui): Agent Detail **Guardrails** tab renamed to **Settings** — sectioned config home. New `components/settings/SettingsPanel.vue` renders `GuardrailsPanel` unchanged as section #1; future per-agent settings land as additive sections, not new tabs. `?tab=guardrails` deep links alias to `settings` via `TAB_ALIASES`. Pure frontend. | [agent-guardrails.md](feature-flows/agent-guardrails.md) | | 2026-06-10 | #1114 | feat(ui): Agent Detail tabs overflow into a **"More ▾"** dropdown instead of horizontal scroll. New reusable `components/OverflowTabs.vue` ("priority+" pattern): a hidden, zero-layout mirror row measures every `{id,label,badge?}` tab's width (+ a worst-case "More" button) so the visible row renders as many tabs as fit and collapses the trailing remainder into a right-aligned disclosure menu. 
Re-measures on container resize (`ResizeObserver` on the outer wrapper, width-diff-guarded + rAF-debounced) and after `document.fonts.ready`; re-measures on tab/label/badge changes via a derived-signature `watch` (`flush:'post'`). Defaults to all-inline before the first measure (no first-paint snap; no "More" when everything fits). Active-in-overflow reflected on the trigger (active underline + dot), tab order never reshuffled. Plain `