2 changes: 1 addition & 1 deletion NEW_DOMAIN.md
@@ -72,7 +72,7 @@
- **`run(domain, experiment_name=...)`**
  - Enters `domain` (domain `setup` → connect MCP → `domain.toolset` available).
  - Converts `domain.tasks()` to a pydantic_evals `Dataset` (each task → `Case` with `inputs=task`, `evaluators=task.evaluators`).
-   - For each case, runs `task_lifecycle` (enter task → run agent with `domain.toolset` → run evaluators) then exits task.
+   - For each case, pydantic_evals `CaseLifecycle` enters the task (enter task → run agent with `domain.toolset` → run evaluators) then `teardown` exits the task.
  - Exits domain (disconnect MCP, domain `teardown`).

So: **domain** = MCP env + list of tasks; **task** = prompt + evaluators + optional per-task setup/teardown; **evaluators** = reusable (common_evaluators) or task-specific (custom_evaluators) checks that use `ctx.inputs` (the task instance) to inspect the environment.
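
For orientation, a minimal sketch of that task→`Dataset` conversion (hedged: `build_dataset` is a hypothetical helper; `Case` and `Dataset` are the pydantic_evals types named above):

```python
from pydantic_evals import Case, Dataset


def build_dataset(domain) -> Dataset:
    # Each task becomes a Case whose `inputs` IS the Task instance,
    # so evaluators can inspect the environment via ctx.inputs.
    return Dataset(
        cases=[
            Case(name=task.name, inputs=task, evaluators=tuple(task.evaluators))
            for task in domain.tasks()
        ]
    )
```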
12 changes: 6 additions & 6 deletions README.md
@@ -479,7 +479,7 @@ sequenceDiagram
R->>D: domain.tasks()

loop For each task
- Note over R: case_context_manager enters task context
+ Note over R: CaseLifecycle.setup enters task context
R->>R: task.setup()
R->>A: agent.run(task.goal, toolsets=[domain.toolset])
A->>LF: Log agent span
@@ -492,7 +492,7 @@
E->>LF: Log evaluator span
E->>MCP: Check environment state
E-->>R: EvaluatorOutput
- Note over R: case_context_manager exits task context
+ Note over R: CaseLifecycle.teardown exits task context
R->>R: task stack cleanup
end

@@ -526,7 +526,7 @@ flowchart LR
end
```

- NOTE: `case_context_manager` is not an official feature, but is implemented in [our fork](https://github.com/voorhs/pydantic-ai/tree/f/case-context-manager). See PR: https://github.com/pydantic/pydantic-ai/pull/4155.
+ NOTE: The per-case resource span (the task is entered before the agent run and closed after the evaluators) uses pydantic_evals [`CaseLifecycle`](https://ai.pydantic.dev/evals/) with an internal `AsyncExitStack`.
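
A condensed sketch of the pattern (`TaskSpanLifecycle` is illustrative; the real subclass, with run-state bookkeeping, is in the `_utils.py` diff below):

```python
from contextlib import AsyncExitStack

from pydantic_evals import Case
from pydantic_evals.lifecycle import CaseLifecycle


class TaskSpanLifecycle(CaseLifecycle):
    """Holds the task's async context open across the agent run and evaluators."""

    def __init__(self, case: Case) -> None:
        super().__init__(case)
        self._stack = AsyncExitStack()

    async def setup(self) -> None:
        # case.inputs is the Task; entering it runs task.setup()
        await self._stack.enter_async_context(self.case.inputs)

    async def teardown(self, result) -> None:
        # Runs only after evaluators finish, so they see the live environment
        await self._stack.aclose()
```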

### Scope Lifecycle

@@ -535,11 +535,11 @@ The library manages resource lifecycle at two levels:
| Scope | Managed By | Lifecycle | Resources |
|-------|------------|-----------|-----------|
| Domain | `async with domain:` | Per domain | MCP connections, CombinedToolset, domain-level fixtures |
- | Task | `case_context_manager` | Per task execution | Temp files, env vars, task-level fixtures |
+ | Task | `CaseLifecycle` via `Dataset.evaluate(lifecycle=...)` | Per task execution | Temp files, env vars, task-level fixtures |

**Domain context managers** handle MCP server connections (via `pydantic_ai.CombinedToolset`) and domain-level setup via `setup(stack)` with stack-based cleanup.

- **Task context managers** handle task-specific setup via `setup(stack)` (fixtures, environment). The task context is managed via `case_context_manager` parameter passed to `dataset.evaluate()`, ensuring it spans both:
+ **Task context managers** handle task-specific setup via `setup(stack)` (fixtures, environment). The runner passes a `CaseLifecycle` subclass to `dataset.evaluate(lifecycle=...)`, ensuring the task context spans both:
- Task execution (agent.run)
- Evaluator execution (evaluator.evaluate)
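
The per-case ordering this guarantees, as a hedged sketch (`run_evaluators` is a stand-in name; the real loop lives inside pydantic_evals):

```python
async def run_case(case, fn, lifecycle_cls):
    lifecycle = lifecycle_cls(case)
    await lifecycle.setup()                   # task context opens (task.setup())
    result = None
    try:
        output = await fn(case.inputs)        # agent.run(...) with domain.toolset
        result = await run_evaluators(case, output)  # environment still alive here
    finally:
        await lifecycle.teardown(result)      # task context closes last
```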

@@ -579,7 +579,7 @@ mcp-evals/

## Dependencies

- - **[pydantic-ai](https://ai.pydantic.dev/)** — LLM provider abstraction + MCP client (uses [our fork](https://github.com/voorhs/pydantic-ai) with pydantic_evals)
+ - **[pydantic-ai](https://ai.pydantic.dev/)** — LLM provider abstraction + MCP client (bundles pydantic_evals)
- **[pydantic-settings](https://docs.pydantic.dev/latest/concepts/pydantic_settings/)** — Environment-based secrets management
- **[loguru](https://github.com/Delgan/loguru)** — Logging
- **[logfire](https://pydantic.dev/logfire)** — Optional observability (via pydantic-ai extra)
5 changes: 1 addition & 4 deletions pyproject.toml
@@ -9,7 +9,7 @@ authors = [
requires-python = ">=3.13"
dependencies = [
"anyio>=4.0,<5.0",
"pydantic-ai",
"pydantic-ai>=1.73,<2.0",
"pydantic-settings>=2.0,<3.0",
"loguru>=0.7,<1.0"
]
@@ -39,9 +39,6 @@ dev = [
    "types-tqdm>=4.67.0.20250809,<5.0.0.0",
]

- [tool.uv.sources]
- pydantic-ai = { git = "https://github.com/voorhs/pydantic-ai", rev = "0772226" }
-
[build-system]
requires = ["uv_build>=0.8.8,<0.9.0"]
build-backend = "uv_build"
5 changes: 3 additions & 2 deletions src/mcp_evals/_internal/evaluated_fn.py
@@ -55,7 +55,8 @@ async def run_agent_on_task(
    manager returned by deps_maker(task); that CM is entered and exited
    for each task so each task gets fresh deps.

-     Note: Task context (setup/teardown) is managed by `case_context_manager`,
+     Note: Task context (setup/teardown) is managed by
+     `pydantic_evals.lifecycle.CaseLifecycle` (registered on `Dataset.evaluate`),
    not inside this function. This ensures evaluators can access task state
    before teardown runs.

@@ -92,7 +93,7 @@ async def run_agent_on_task_with_self_correction(
    augments the goal with the failure reasons and retries. Continues until all
    evaluators pass or max_retries is reached.

-     Task context (setup/teardown) is managed by case_context_manager, not here.
+     Task context (setup/teardown) is managed by CaseLifecycle, not here.
    Evaluators use ctx.inputs (task) and ctx.output (result); we use a minimal
    SimpleNamespace for ctx since EvaluatorContext has many internal fields.
    """
4 changes: 2 additions & 2 deletions src/mcp_evals/_internal/runner/_domain_runner.py
@@ -163,7 +163,7 @@ async def _run_train_phase(
        await train_dataset.evaluate(
            evaluated_fn,
            max_concurrency=self.max_concurrency,
-             case_context_manager=make_task_lifecycle(state, split_idx, "train"),
+             lifecycle=make_task_lifecycle(state, split_idx, "train"),
            progress=False,
            name=f"{base_name}_train_{split_idx}_",
        )
@@ -191,7 +191,7 @@
        return await test_dataset.evaluate(
            evaluated_fn,
            max_concurrency=self.max_concurrency,
-             case_context_manager=make_task_lifecycle(state, split_idx, "test"),
+             lifecycle=make_task_lifecycle(state, split_idx, "test"),
            progress=False,
            name=f"{base_name}_test_{split_idx}" if experiment_name else None,
        )
82 changes: 43 additions & 39 deletions src/mcp_evals/_internal/runner/_utils.py
@@ -1,13 +1,15 @@
"""Internal runner for executing domains and tasks."""

- from collections.abc import AsyncIterator, Callable
- from contextlib import asynccontextmanager
+ from collections.abc import AsyncIterator
+ from contextlib import AsyncExitStack, asynccontextmanager
from typing import Any, Literal

from loguru import logger
from pydantic_ai.exceptions import UsageLimitExceeded
from pydantic_ai.run import AgentRunResult
from pydantic_evals import Case
+ from pydantic_evals.lifecycle import CaseLifecycle
+ from pydantic_evals.reporting import ReportCase, ReportCaseFailure

from mcp_evals.task import Task
from mcp_evals.types import DepsMaker
@@ -27,55 +29,57 @@ def default_deps_maker() -> DepsMaker:
    return lambda _task: _no_deps_cm()


- @asynccontextmanager
- async def task_lifecycle(case: Case[Task[Any, Any], AgentRunResult, None]) -> AsyncIterator[None]:
-     """Context manager that wraps task execution + evaluation.
-
-     This ensures the task context (setup/teardown) spans both:
-     - Task execution (agent.run)
-     - Evaluator execution (evaluator.evaluate)
-
-     This is critical because evaluators often need to check the environment
-     state (files, database, etc.) that was set up during task.setup(), and
-     this state must remain available until after evaluators complete.
-     """
-     task = case.inputs  # In mcp_evals, inputs IS the Task instance
-     async with task:
-         yield
+ def _failure_is_usage_limit(result: ReportCaseFailure[Any, Any, Any]) -> bool:
+     """Detect usage-limit failures when only string error fields are available."""
+     name = UsageLimitExceeded.__name__
+     return name in result.error_message or name in result.error_stacktrace


def make_task_lifecycle(
    state: RunState,
    split_idx: int,
    phase: Phase,
- ) -> Callable[..., Any]:
-     """Return a context manager factory: callable(case) for use as case_context_manager.
+ ) -> type[CaseLifecycle[Task[Any, Any], AgentRunResult[Any], None]]:
+     """Return a ``CaseLifecycle`` subclass for ``Dataset.evaluate(..., lifecycle=...)``.

-     Marks task as finished in run state. For certain errors (e.g., ModelUsageExceeded),
-     marks as finished even though the task failed, suppressing the exception so it
-     doesn't propagate to pydantic_evals (which will still mark case as failed in reporting).
+     Enters the task async context in ``setup()`` (via :class:`~contextlib.AsyncExitStack`)
+     so it stays active through the evaluated function and evaluators; ``teardown()``
+     closes the stack and updates run state.

-     This distinction is important: some errors indicate the task *execution* completed
-     but was interrupted by external constraints (e.g., quota exceeded), so retrying
-     makes no sense. The task should be marked done in run state for resume purposes.
+     Marks the task finished in run state on success, or on usage-limit exhaustion
+     (retrying the same task would not help). Other failures do not mark finished
+     so resume can retry the task.
    """

-     @asynccontextmanager
-     async def _lifecycle(case: Case[Task[Any, Any], AgentRunResult, None]) -> AsyncIterator[None]:
-         task = case.inputs
-         try:
-             async with task:
-                 yield
-         except Exception as e:
-             if isinstance(e, UsageLimitExceeded):
+     class McpTaskLifecycle(CaseLifecycle[Task[Any, Any], AgentRunResult[Any], None]):
+         def __init__(self, case: Case[Task[Any, Any], AgentRunResult[Any], None]) -> None:
+             super().__init__(case)
+             self._exit_stack: AsyncExitStack | None = None
+
+         async def setup(self) -> None:
+             self._exit_stack = AsyncExitStack()
+             task = self.case.inputs
+             await self._exit_stack.enter_async_context(task)
+
+         async def teardown(
+             self,
+             result: ReportCase[Task[Any, Any], AgentRunResult[Any], None]
+             | ReportCaseFailure[Task[Any, Any], AgentRunResult[Any], None],
+         ) -> None:
+             if self._exit_stack is not None:
+                 await self._exit_stack.aclose()
+                 self._exit_stack = None
+
+             task = self.case.inputs
+             if isinstance(result, ReportCase):
+                 await state.mark_task_finished(split_idx, phase, task.name)
+                 return
+
+             if _failure_is_usage_limit(result):
                logger.exception(
-                     f"[{task.name}] Usage exceeded"
+                     f"[{task.name}] Usage exceeded. "
                    "Task will be marked as finished (not retried), but case marked as failed in reporting."
                )
                await state.mark_task_finished(split_idx, phase, task.name)
-                 raise
-             else:
-                 # Success path: no exception occurred
-                 await state.mark_task_finished(split_idx, phase, task.name)

-     return _lifecycle
+     return McpTaskLifecycle