Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/openrange/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
AdvanceRequest,
AgentTurn,
EpisodeCheckpoint,
EpisodeCost,
EpisodeError,
EpisodeHandle,
EpisodeReport,
Expand Down Expand Up @@ -90,6 +91,7 @@
"EdgeKind",
"EpisodeCheckpoint",
"EpisodeContext",
"EpisodeCost",
"EpisodeError",
"EpisodeHandle",
"EpisodeReport",
Expand Down
32 changes: 32 additions & 0 deletions src/openrange/core/episode.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import atexit
import contextlib
import threading
import time
import uuid
import weakref
from collections.abc import Callable, Mapping
Expand Down Expand Up @@ -98,6 +99,17 @@ class EpisodeUpdate:
terminal_reason: str | None = None


@dataclass(frozen=True, slots=True)
class EpisodeCost:
"""What one episode cost to run. ``wall_seconds`` spans realize → grade;
``realize_seconds`` is the setup portion (realize + reset). Token usage
is not yet attributed — it needs per-backend usage reporting."""

wall_seconds: float = 0.0
realize_seconds: float = 0.0
turns: int = 0


@dataclass(frozen=True, slots=True)
class EpisodeReport:
"""Terminal artifact from a stopped episode. Implements `EpisodeReportLike`."""
Expand All @@ -107,6 +119,7 @@ class EpisodeReport:
episode_result: EpisodeResult
final_state: Mapping[str, Any] = field(default_factory=dict)
agent_summary: str = ""
cost: EpisodeCost = field(default_factory=EpisodeCost)

@property
def passed(self) -> bool:
Expand All @@ -123,6 +136,11 @@ def as_dict(self) -> dict[str, Any]:
},
"final_state": dict(self.final_state),
"agent_summary": self.agent_summary,
"cost": {
"wall_seconds": self.cost.wall_seconds,
"realize_seconds": self.cost.realize_seconds,
"turns": self.cost.turns,
},
}


Expand Down Expand Up @@ -154,6 +172,10 @@ class _RunningEpisode:
tick_stop: threading.Event | None = None
npcs: list[NPC] = field(default_factory=list)
stopped: bool = False
started_at: float = 0.0
realized_at: float = 0.0
stopped_at: float = 0.0
turns: int = 0


class EpisodeService:
Expand Down Expand Up @@ -217,6 +239,7 @@ def start_episode(
)
episode_root.mkdir(parents=True)

started_at = time.perf_counter()
runtime = self.pack.realize(snapshot.graph, self.backing)
try:
runtime.reset()
Expand All @@ -235,6 +258,8 @@ def start_episode(
run_root=episode_root,
surface_cache=surface_mapping,
dashboard=self.dashboard,
started_at=started_at,
realized_at=time.perf_counter(),
)
self._episodes[handle.id] = running
self._record_system(
Expand Down Expand Up @@ -276,6 +301,7 @@ def stop_episode(self, episode: EpisodeHandle) -> EpisodeReport:
running.final_state = final_state
episode_result = self._check_success(running, final_state)
running.episode_result = episode_result
running.stopped_at = time.perf_counter()
try:
running.runtime.stop()
except Exception as exc: # noqa: BLE001
Expand Down Expand Up @@ -344,6 +370,7 @@ def record_turn(self, episode: EpisodeHandle, turn: AgentTurn) -> None:
"""Observational breadcrumb. The latest non-empty `message` lands
in `EpisodeReport.agent_summary`."""
running = self._require(episode)
running.turns += 1
if turn.message:
running.agent_summary = turn.message

Expand Down Expand Up @@ -466,6 +493,11 @@ def _cached_report(self, running: _RunningEpisode) -> EpisodeReport:
episode_result=running.episode_result,
final_state=running.final_state,
agent_summary=running.agent_summary,
cost=EpisodeCost(
wall_seconds=running.stopped_at - running.started_at,
realize_seconds=running.realized_at - running.started_at,
turns=running.turns,
),
)

def _check_success(
Expand Down
46 changes: 46 additions & 0 deletions tests/test_runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,52 @@ def solve(ctx: EpisodeContext) -> AgentTurn:
assert ep.report.agent_summary.startswith("http://")


class TestEpisodeCost:
    """Checks on the ``cost`` attached to an episode report."""

    def test_counts_recorded_turns(self, snapshot: Snapshot, tmp_path: Path) -> None:
        """A solver returning two turns yields ``cost.turns == 2``."""

        def two_turn_solver(ctx: EpisodeContext) -> list[AgentTurn]:
            _write_reference(ctx)
            first = AgentTurn(message="a")
            second = AgentTurn(message="b")
            return [first, second]

        config = RunConfig(tmp_path, dashboard=False)
        runner = OpenRangeRun(config)
        task_id = _build_task_id(snapshot)
        episode = runner.run_episode(snapshot, two_turn_solver, task_id=task_id)
        assert episode.report.cost.turns == 2

    def test_noop_solver_costs_zero_turns(
        self, snapshot: Snapshot, tmp_path: Path
    ) -> None:
        """A solver that returns nothing records no turns."""

        def idle_solver(ctx: EpisodeContext) -> None:
            return None

        config = RunConfig(tmp_path, dashboard=False)
        runner = OpenRangeRun(config)
        task_id = _build_task_id(snapshot)
        episode = runner.run_episode(snapshot, idle_solver, task_id=task_id)
        assert episode.report.cost.turns == 0

    def test_timing_invariants(self, snapshot: Snapshot, tmp_path: Path) -> None:
        """Setup time is non-negative and never exceeds total wall time."""

        def single_turn_solver(ctx: EpisodeContext) -> AgentTurn:
            _write_reference(ctx)
            return AgentTurn(message="ok")

        config = RunConfig(tmp_path, dashboard=False)
        runner = OpenRangeRun(config)
        task_id = _build_task_id(snapshot)
        episode = runner.run_episode(snapshot, single_turn_solver, task_id=task_id)
        measured = episode.report.cost
        assert measured.wall_seconds >= measured.realize_seconds
        assert measured.realize_seconds >= 0.0

    def test_cost_serialized_in_as_dict(
        self, snapshot: Snapshot, tmp_path: Path
    ) -> None:
        """``as_dict()`` exposes exactly the three cost fields."""

        def single_turn_solver(ctx: EpisodeContext) -> AgentTurn:
            _write_reference(ctx)
            return AgentTurn(message="ok")

        config = RunConfig(tmp_path, dashboard=False)
        runner = OpenRangeRun(config)
        task_id = _build_task_id(snapshot)
        episode = runner.run_episode(snapshot, single_turn_solver, task_id=task_id)
        serialized = episode.report.as_dict()["cost"]
        expected_keys = {"wall_seconds", "realize_seconds", "turns"}
        assert set(serialized) == expected_keys
        assert serialized["turns"] == 1


def _backing_manifest(backing: str | None) -> dict[str, object]:
manifest: dict[str, object] = {
"world": {"goal": "backing selection"},
Expand Down
Loading