Skip to content

Commit 37bff73

Browse files
committed
fix(webapi): add grace period for transient error states
1 parent 1e179bf commit 37bff73

File tree

9 files changed

+121
-26
lines changed

9 files changed

+121
-26
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
2626
- Fixed frequency accumulation of gradients for custom dispersive media.
2727
- Fixed `snap_box_to_grid` producing zero-size boxes when using `Expand` behavior with very small intervals centered on a grid point.
2828
- Fixed sliver polygon artifacts in 2D material subdivision by filtering polygons based on grid cell size, preventing numerical issues with large-coordinate geometries.
29+
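- Fixed CLI monitoring (`web.monitor()`) raising fatal errors on transient backend error states during automatic retries; it now waits up to `web.monitor_error_grace_period` seconds (default 60) for the task to recover before raising.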
2930

3031
## [2.10.2] - 2026-01-21
3132

tests/test_web/test_webapi.py

Lines changed: 60 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# Tests webapi and things that depend on it
22
from __future__ import annotations
33

4+
import json
45
import os
56
import posixpath
67
from concurrent.futures import Future
@@ -25,7 +26,7 @@
2526
from tidy3d.components.monitor import FieldMonitor
2627
from tidy3d.components.source.current import PointDipole
2728
from tidy3d.components.source.time import GaussianPulse
28-
from tidy3d.exceptions import SetupError
29+
from tidy3d.exceptions import SetupError, WebError
2930
from tidy3d.web import common
3031
from tidy3d.web.api.asynchronous import run_async
3132
from tidy3d.web.api.container import Batch, Job, WebContainer
@@ -41,6 +42,7 @@
4142
estimate_cost,
4243
get_info,
4344
get_run_info,
45+
get_status,
4446
get_tasks,
4547
load,
4648
load_simulation,
@@ -51,6 +53,7 @@
5153
)
5254
from tidy3d.web.core.environment import Env
5355
from tidy3d.web.core.exceptions import WebNotFoundError
56+
from tidy3d.web.core.task_info import TaskInfo
5457
from tidy3d.web.core.types import PayType, TaskType
5558

5659
TASK_NAME = "task_name_test"
@@ -272,7 +275,7 @@ def mock_monitor(monkeypatch):
272275
status_count = [0]
273276
statuses = ("upload", "running", "running", "running", "running", "running", "success")
274277

275-
def mock_get_status(task_id):
278+
def mock_get_status(task_id, **_kwargs):
276279
current_count = min(status_count[0], len(statuses) - 1)
277280
current_status = statuses[current_count]
278281
status_count[0] += 1
@@ -419,6 +422,61 @@ def test_get_run_info(mock_get_run_info, mock_get_info):
419422
assert get_run_info(TASK_ID) == (100, 0)
420423

421424

425+
def test_get_status_grace_period_recovers(monkeypatch):
426+
statuses = iter(["run_error", "run_error", "running"])
427+
428+
def mock_get_info(task_id):
429+
status = next(statuses, "running")
430+
return TaskInfo(taskId=task_id, status=status, taskType=TaskType.MODE.name)
431+
432+
time_state = {"t": 0.0}
433+
434+
def fake_monotonic():
435+
return time_state["t"]
436+
437+
def fake_sleep(seconds):
438+
time_state["t"] += seconds
439+
440+
monkeypatch.setattr(f"{api_path}.TaskFactory.get", lambda *_args, **_kwargs: None)
441+
monkeypatch.setattr(f"{api_path}.get_info", mock_get_info)
442+
monkeypatch.setattr(f"{api_path}.REFRESH_TIME", 0.01)
443+
monkeypatch.setattr(f"{api_path}.time.sleep", fake_sleep)
444+
monkeypatch.setattr(f"{api_path}.time.monotonic", fake_monotonic)
445+
446+
assert get_status(TASK_ID, error_grace_period=0.05) == "running"
447+
448+
449+
def test_get_status_grace_period_expires(monkeypatch):
450+
statuses = iter(["run_error", "run_error", "run_error"])
451+
452+
def mock_get_info(task_id):
453+
status = next(statuses, "run_error")
454+
return TaskInfo(taskId=task_id, status=status, taskType=TaskType.MODE.name)
455+
456+
def mock_get_error_json(self, to_file, **_kwargs):
457+
with open(to_file, "w", encoding="utf8") as handle:
458+
json.dump({"msg": "boom"}, handle)
459+
return Path(to_file)
460+
461+
time_state = {"t": 0.0}
462+
463+
def fake_monotonic():
464+
return time_state["t"]
465+
466+
def fake_sleep(seconds):
467+
time_state["t"] += seconds
468+
469+
monkeypatch.setattr(f"{api_path}.TaskFactory.get", lambda *_args, **_kwargs: None)
470+
monkeypatch.setattr(f"{api_path}.get_info", mock_get_info)
471+
monkeypatch.setattr(f"{api_path}.SimulationTask.get_error_json", mock_get_error_json)
472+
monkeypatch.setattr(f"{api_path}.REFRESH_TIME", 0.01)
473+
monkeypatch.setattr(f"{api_path}.time.sleep", fake_sleep)
474+
monkeypatch.setattr(f"{api_path}.time.monotonic", fake_monotonic)
475+
476+
with pytest.raises(WebError, match="boom"):
477+
get_status(TASK_ID, error_grace_period=0.02)
478+
479+
422480
@responses.activate
423481
def test_download(mock_download, tmp_path):
424482
download(TASK_ID, str(tmp_path / "web_test_tmp.json"))

tests/test_web/test_webapi_eme.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ def mock_monitor(monkeypatch):
151151
status_count = [0]
152152
statuses = ("upload", "running", "running", "running", "running", "running", "success")
153153

154-
def mock_get_status(task_id):
154+
def mock_get_status(task_id, **_kwargs):
155155
current_count = min(status_count[0], len(statuses) - 1)
156156
current_status = statuses[current_count]
157157
status_count[0] += 1

tests/test_web/test_webapi_heat.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ def mock_monitor(monkeypatch):
148148
status_count = [0]
149149
statuses = ("upload", "running", "running", "running", "running", "running", "success")
150150

151-
def mock_get_status(task_id):
151+
def mock_get_status(task_id, **_kwargs):
152152
current_count = min(status_count[0], len(statuses) - 1)
153153
current_status = statuses[current_count]
154154
status_count[0] += 1

tests/test_web/test_webapi_mode.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ def mock_monitor(monkeypatch):
184184
status_count = [0]
185185
statuses = ("upload", "running", "running", "running", "running", "running", "success")
186186

187-
def mock_get_status(task_id):
187+
def mock_get_status(task_id, **_kwargs):
188188
current_count = min(status_count[0], len(statuses) - 1)
189189
current_status = statuses[current_count]
190190
status_count[0] += 1

tests/test_web/test_webapi_mode_sim.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ def mock_monitor(monkeypatch):
180180
status_count = [0]
181181
statuses = ("upload", "running", "running", "running", "running", "running", "success")
182182

183-
def mock_get_status(task_id):
183+
def mock_get_status(task_id, **_kwargs):
184184
current_count = min(status_count[0], len(statuses) - 1)
185185
current_status = statuses[current_count]
186186
status_count[0] += 1

tidy3d/config/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ flowchart LR
5050

5151
## Module Reference
5252

53-
- `sections.py` - Pydantic models for built-in sections (logging, simulation, microwave, adjoint, web, local cache, plugin container) registered via `register_section`. The bundled models inherit from the internal `ConfigSection` helper, but external code can use plain `BaseModel` subclasses. Optional handlers perform side effects. Fields mark persistence with `json_schema_extra={"persist": True}`.
53+
- `sections.py` - Pydantic models for built-in sections (logging, simulation, microwave, adjoint, web, local cache, plugin container) registered via `register_section`. The bundled models inherit from the internal `ConfigSection` helper, but external code can use plain `BaseModel` subclasses. Optional handlers perform side effects. Fields mark persistence with `json_schema_extra={"persist": True}`. `web.monitor_error_grace_period` controls how long `web.monitor()` waits through transient error states before raising.
5454
- `registry.py` - Stores section and handler registries and notifies the attached manager so new entries appear immediately.
5555
- `manager.py` - `ConfigManager` caches validated models, tracks runtime overrides per profile, filters persisted fields, exposes helpers such as `plugins`, `profiles`, and `format`. `SectionAccessor` routes attribute access to `update_section`.
5656
- `loader.py` - Resolves the config directory, loads `config.toml` and `profiles/<name>.toml`, parses environment overrides, and writes atomically through `serializer.build_document`.

tidy3d/config/sections.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,15 @@ class WebConfig(ConfigSection):
363363
le=300,
364364
)
365365

366+
monitor_error_grace_period: NonNegativeFloat = Field(
367+
60.0,
368+
title="Monitor error grace period",
369+
description=(
370+
"Seconds to wait out transient error statuses during web.monitor() "
371+
"before raising an error."
372+
),
373+
)
374+
366375
ssl_version: Optional[str] = Field(
367376
None,
368377
title="SSL/TLS version",

tidy3d/web/api/webapi.py

Lines changed: 46 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -732,22 +732,50 @@ def _get_batch_detail_handle_error_status(batch: BatchTask) -> BatchDetail:
732732
return detail
733733

734734

735-
def get_status(task_id: TaskId) -> str:
735+
def get_status(task_id: TaskId, *, error_grace_period: float = 0.0) -> str:
736736
"""Get the status of a task. Raises an error if status is "error".
737737
738738
Parameters
739739
----------
740740
task_id : str
741741
Unique identifier of task on server. Returned by :meth:`upload`.
742+
error_grace_period : float = 0.0
743+
Seconds to wait out transient error statuses before raising an error.
742744
"""
745+
746+
def _wait_out_error(fetch_status: Callable[[], str], raw_status: str | None) -> str | None:
747+
if error_grace_period <= 0:
748+
return raw_status
749+
deadline = time.monotonic() + error_grace_period
750+
status = (raw_status or "").lower()
751+
while status in ERROR_STATES and time.monotonic() < deadline:
752+
time.sleep(REFRESH_TIME)
753+
raw_status = fetch_status()
754+
status = (raw_status or "").lower()
755+
return raw_status
756+
743757
task = TaskFactory.get(task_id)
744758
if isinstance(task, BatchTask):
745-
return _get_batch_detail_handle_error_status(task).status
759+
detail = task.detail()
760+
raw_status = detail.status
761+
status = (raw_status or "").lower()
762+
if status in ERROR_STATES:
763+
raw_status = _wait_out_error(lambda: task.detail().status, raw_status)
764+
status = (raw_status or "").lower()
765+
if status in ERROR_STATES:
766+
_batch_detail_error(task.task_id)
767+
return raw_status
746768
else:
747769
task_info = get_info(task_id)
748-
status = task_info.status
770+
raw_status = task_info.status
771+
status = (raw_status or "").lower()
749772
if status == "visualize":
750773
return "success"
774+
if status in ERROR_STATES:
775+
raw_status = _wait_out_error(lambda: get_info(task_id).status, raw_status)
776+
status = (raw_status or "").lower()
777+
if status == "visualize":
778+
return "success"
751779
if status in ERROR_STATES:
752780
try:
753781
# Try to obtain the error message
@@ -762,7 +790,7 @@ def get_status(task_id: TaskId) -> str:
762790
error_msg = "Error message could not be obtained, please contact customer support."
763791

764792
raise WebError(f"Error running task {task_id}! {error_msg}")
765-
return status
793+
return raw_status
766794

767795

768796
def monitor(task_id: TaskId, verbose: bool = True, worker_group: Optional[str] = None) -> None:
@@ -823,18 +851,21 @@ def get_estimated_cost() -> float:
823851
est_flex_unit = task_info.estFlexUnit
824852
return est_flex_unit
825853

854+
def _get_status() -> str:
855+
return get_status(task_id, error_grace_period=config.web.monitor_error_grace_period)
856+
826857
def monitor_preprocess() -> None:
827858
"""Periodically check the status."""
828-
status = get_status(task_id)
859+
status = _get_status()
829860
while status not in END_STATES and status != "running":
830-
new_status = get_status(task_id)
861+
new_status = _get_status()
831862
if new_status != status:
832863
status = new_status
833864
if verbose and status != "running":
834865
console.log(f"status = {status}")
835866
time.sleep(REFRESH_TIME)
836867

837-
status = get_status(task_id)
868+
status = _get_status()
838869

839870
if verbose:
840871
console.log(f"status = {status}")
@@ -861,7 +892,7 @@ def monitor_preprocess() -> None:
861892
console.log("starting up solver")
862893

863894
# while running but before the percentage done is available, keep waiting
864-
while get_run_info(task_id)[0] is None and get_status(task_id) == "running":
895+
while get_run_info(task_id)[0] is None and _get_status() == "running":
865896
time.sleep(REFRESH_TIME)
866897

867898
# while running but percentage done is available
@@ -873,9 +904,7 @@ def monitor_preprocess() -> None:
873904
pbar_pd = progress.add_task("% done", total=100)
874905
perc_done, _ = get_run_info(task_id)
875906

876-
while (
877-
perc_done is not None and perc_done < 100 and get_status(task_id) == "running"
878-
):
907+
while perc_done is not None and perc_done < 100 and _get_status() == "running":
879908
perc_done, field_decay = get_run_info(task_id)
880909
new_description = f"solver progress (field decay = {field_decay:.2e})"
881910
progress.update(pbar_pd, completed=perc_done, description=new_description)
@@ -892,9 +921,7 @@ def monitor_preprocess() -> None:
892921
pbar_pd = progress.add_task("% done", total=100)
893922
perc_done, _ = get_run_info(task_id)
894923

895-
while (
896-
perc_done is not None and perc_done < 100 and get_status(task_id) == "running"
897-
):
924+
while perc_done is not None and perc_done < 100 and _get_status() == "running":
898925
perc_done, _ = get_run_info(task_id)
899926
new_description = "solver progress"
900927
progress.update(pbar_pd, completed=perc_done, description=new_description)
@@ -904,26 +931,26 @@ def monitor_preprocess() -> None:
904931
new_description = "solver progress"
905932
progress.update(pbar_pd, completed=100, refresh=True, description=new_description)
906933
else:
907-
while get_status(task_id) == "running":
934+
while _get_status() == "running":
908935
perc_done, _ = get_run_info(task_id)
909936
time.sleep(RUN_REFRESH_TIME)
910937

911938
else:
912939
# non-verbose case, just keep checking until status is not running or perc_done >= 100
913940
perc_done, _ = get_run_info(task_id)
914-
while perc_done is not None and perc_done < 100 and get_status(task_id) == "running":
941+
while perc_done is not None and perc_done < 100 and _get_status() == "running":
915942
perc_done, field_decay = get_run_info(task_id)
916943
time.sleep(RUN_REFRESH_TIME)
917944

918945
# post processing
919946
if verbose:
920-
status = get_status(task_id)
947+
status = _get_status()
921948
if status != "running":
922949
console.log(f"status = {status}")
923950

924951
with console.status(f"[bold green]Finishing '{task_name}'...", spinner="runner"):
925952
while status not in END_STATES:
926-
new_status = get_status(task_id)
953+
new_status = _get_status()
927954
if new_status != status:
928955
status = new_status
929956
console.log(f"status = {status}")
@@ -933,7 +960,7 @@ def monitor_preprocess() -> None:
933960
url = _get_url(task_id)
934961
console.log(f"View simulation result at [blue underline][link={url}]'{url}'[/link].")
935962
else:
936-
while get_status(task_id) not in END_STATES:
963+
while _get_status() not in END_STATES:
937964
time.sleep(REFRESH_TIME)
938965

939966

0 commit comments

Comments
 (0)